diff --git a/.env.example b/.env.example index ef8f6bc6..b089ed59 100644 --- a/.env.example +++ b/.env.example @@ -11,70 +11,114 @@ # python scripts/run_text_only.py --record-id 1.1.2 # ============================================== -# Required: API Keys +# API Configs # ============================================== -# --- ElevenLabs (user simulator) --- +# --- ElevenLabs --- +#i ElevenLabs API key for the user simulator. +#d secret ELEVENLABS_API_KEY=your_elevenlabs_api_key_here -# ElevenLabs Conversational AI agent IDs for user simulation. -# Create a Conversational AI agent at https://elevenlabs.io/conversational-ai and copy its agent ID. -# You need two agents: one with a female voice (persona 1) and one with a male voice (persona 2). -# These are used to simulate different caller personas during benchmark conversations. +# --- LLM / Text Judge --- +#i OpenAI key for assistant LLM and text judge metrics. +#d secret +OPENAI_API_KEY=your_openai_api_key_here -EVA_DEFAULT_USER_F=your_elevenlabs_agent_id_for_default_user_f -EVA_DEFAULT_USER_M=your_elevenlabs_agent_id_for_default_user_m +# --- Audio Judge (Gemini via GCP) --- +#i Path to GCP service-account JSON for Gemini audio judge metrics. +#d path +GOOGLE_APPLICATION_CREDENTIALS=path/to/your/service-account-credentials.json -# --- LLM (assistant + text judge metrics) --- -OPENAI_API_KEY=your_openai_api_key_here +# --- Faithfulness Metric (Claude via Bedrock) --- +#i AWS access key for Claude via Bedrock (faithfulness metric). +#d secret +AWS_ACCESS_KEY_ID=your_aws_access_key_id_here + +#i AWS secret access key. +#d secret +AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here + +# --- Alternative providers (optional) --- +# If you only have an OpenAI key you can skip AWS and set JUDGE_MODEL=gpt-5.2 +# to override all text judges. Audio judge metrics still require Gemini. + +#i Azure OpenAI key (alternative to direct OpenAI). 
+#d secret +#v AZURE_OPENAI_API_KEY=your_azure_openai_api_key_here + +#i Azure OpenAI endpoint URL. +#d string +#v AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/ + +#i Google API key (alternative to service-account credentials for Gemini). +#d secret +#v GOOGLE_API_KEY=your_google_api_key_here + +# ============================================== +# Voice Pipeline +# ============================================== -# --- STT/TTS (voice pipeline) --- -# The API key and model for your chosen provider must be passed via the *_PARAMS JSON. +# Pipeline mode is controlled by the UI radio (LLM / S2S / AudioLLM). +# The #x conditions below ensure each variable is only active for the right mode. -# STT provider: assemblyai | cartesia | deepgram | deepgram-flux | elevenlabs | nvidia | nvidia-baseten | openai +# --- LLM mode: STT --- +#i STT provider for the voice pipeline. +#d enum +#e assemblyai,cartesia,deepgram,deepgram-flux,elevenlabs,nvidia,nvidia-baseten,openai +#x pipeline_mode=LLM EVA_MODEL__STT=cartesia -# Must include "api_key" and "model" for your chosen provider: + +#i STT provider parameters. Must include "api_key" and "model". +#d json_object +#x pipeline_mode=LLM EVA_MODEL__STT_PARAMS='{"api_key": "your_cartesia_api_key", "model": "ink-whisper"}' -# TTS provider: cartesia | chatterbox | elevenlabs | gemini | kokoro | nvidia-baseten | openai | xtts +# --- LLM mode: TTS --- +#i TTS provider for the voice pipeline. 
+#d enum +#e cartesia,chatterbox,elevenlabs,gemini,kokoro,nvidia-baseten,openai,xtts +#x pipeline_mode=LLM EVA_MODEL__TTS=cartesia -# Must include "api_key" and "model" for your chosen provider: -EVA_MODEL__TTS_PARAMS='{"api_key": "your_cartesia_api_key", "model": "sonic"}' -# For round-robin load balancing, use "urls" instead of "url": -# EVA_MODEL__TTS_PARAMS='{"api_key": "...", "model": "sonic", "urls": ["http://server1/v1", "http://server2/v1"]}' - -# --- Metrics judge models --- -# Google credentials (audio judge metrics default to Gemini) -GOOGLE_APPLICATION_CREDENTIALS=path/to/your/service-account-credentials.json -# AWS credentials (faithfulness metric defaults to Claude via Bedrock) -AWS_ACCESS_KEY_ID=your_aws_access_key_id_here -AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here +#i TTS provider parameters. Must include "api_key" and "model". Use "urls" for round-robin load balancing. +#d json_object +#x pipeline_mode=LLM +EVA_MODEL__TTS_PARAMS='{"api_key": "your_cartesia_api_key", "model": "sonic"}' -# If you only have an OpenAI key, you can skip the AWS credentials above and -# override all text judge models (including faithfulness) to use OpenAI instead -# (results may be less accurate): -# JUDGE_MODEL=gpt-5.2 -# Audio judge metrics (agent_speech_fidelity, user_speech_fidelity) still require -# Gemini. To skip them, run only text-based metrics, e.g.: -# EVA_METRICS=task_completion,faithfulness,conciseness,turn_taking +# --- S2S mode --- +#i Speech-to-speech model name. +#d string +#x pipeline_mode=S2S +#v EVA_MODEL__S2S=openai + +#i Speech-to-speech model parameters. +#d json_object +#x pipeline_mode=S2S +#v EVA_MODEL__S2S_PARAMS='{"model": "gpt-realtime-mini", "api_key": ""}' + +# --- AudioLLM mode --- +#i Audio-input LLM model name. +#d string +#x pipeline_mode=AudioLLM +#v EVA_MODEL__AUDIO_LLM= + +#i Audio-input LLM model parameters. 
+#d json_object +#x pipeline_mode=AudioLLM +#v EVA_MODEL__AUDIO_LLM_PARAMS='{"url": "", "api_key": ""}' + +# --- Framework (S2S / AudioLLM) --- +#i Base framework for S2S or AudioLLM pipelines. +#d enum +#e pipecat,openai_realtime,gemini_live,elevenlabs +#v EVA_FRAMEWORK=openai_realtime # ============================================== -# Required: Model Deployments +# LiteLLM Deployments # ============================================== -# -# EVA_MODEL_LIST: JSON array of LiteLLM Router deployments. -# - model_name: alias your code uses (e.g., "gpt-5.2") -# - litellm_params.model: provider-specific identifier (e.g., "openai/gpt-4o") -# - Use "os.environ/VAR_NAME" syntax to reference other env vars -# -# EVA needs at minimum: -# 1. An LLM for the assistant (matches EVA_MODEL__LLM below) -# 2. Gemini for audio judge metrics -# 3. Claude (Bedrock) for the faithfulness metric -# -# See docs/llm_configuration.md for more provider examples and load balancing. +#i LiteLLM Router deployments. Use "os.environ/VAR_NAME" to reference other env vars. +#d json_deployment_list EVA_MODEL_LIST='[ { "model_name": "gpt-5.2", @@ -106,222 +150,233 @@ EVA_MODEL_LIST='[ } ]' -# --- Optional: additional model deployments --- -# Uncomment and add to EVA_MODEL_LIST above as needed. -# -# Azure OpenAI (alternative to direct OpenAI): -# { -# "model_name": "gpt-5.2", -# "litellm_params": { -# "model": "azure/gpt-5.2", -# "api_key": "os.environ/AZURE_OPENAI_API_KEY", -# "api_base": "https://your-resource.openai.azure.com", -# "max_parallel_requests": 5 -# }, -# "model_info": {"base_model": "gpt-5.2"} -# } -# -# OpenAI Responses API (for multi-turn encrypted reasoning on o-series / gpt-5.x models): -# Add "use_responses_api": true at the top level of the deployment (not inside litellm_params). -# Also set "reasoning_effort" inside litellm_params to enable reasoning. 
-# { -# "model_name": "gpt-5.2", -# "litellm_params": { -# "model": "azure/gpt-5.2", -# "api_key": "os.environ/AZURE_OPENAI_API_KEY", -# "api_base": "https://your-resource.openai.azure.com", -# "reasoning_effort": "low", -# "max_parallel_requests": 5 -# }, -# "use_responses_api": true -# } -# -# Self-hosted model (e.g., vLLM, NVIDIA NIM): -# { -# "model_name": "my-model", -# "litellm_params": { -# "model": "openai/my-model-name", -# "api_key": "os.environ/MY_MODEL_KEY", -# "api_base": "http://my-server:8000/v1", -# "max_parallel_requests": 5 -# } -# } -# -# Load balancing (multiple endpoints for the same model): -# { -# "model_name": "my-model", -# "litellm_params": {"model": "openai/my-model", "api_base": "http://server1:8000/v1", ...} -# }, -# { -# "model_name": "my-model", -# "litellm_params": {"model": "openai/my-model", "api_base": "http://server2:8000/v1", ...} -# } +#i LLM model alias for the assistant. Must match a model_name in EVA_MODEL_LIST. +#d enum +#x pipeline_mode=LLM +EVA_MODEL__LLM=gpt-5.2 # ============================================== -# Required: Framework Configuration +# Framework & Runtime # ============================================== -# Domain name — determines dataset, agent config, and scenario paths: -# data/{domain}_dataset.jsonl -# configs/agents/{domain}_agent.yaml -# data/{domain}_scenarios/ -# The included sample domain is "airline". -# EVA_DOMAIN=airline +#i Domain determines dataset, agent config, and scenario paths (data/{domain}_dataset.jsonl etc). +#d enum +#e airline,itsm,medical_hr +#v EVA_DOMAIN=airline -# LLM model name — must match a model_name in EVA_MODEL_LIST above. -EVA_MODEL__LLM=gpt-5.2 +#i Maximum number of concurrent conversations. +#d int +#r 1,100,1 +#v EVA_MAX_CONCURRENT_CONVERSATIONS=1 -# ============================================== -# Optional: Alternative LLM Provider Keys -# ============================================== +#i Conversation timeout in seconds. 
+#d int +#r 30,10000,10 +#v EVA_CONVERSATION_TIMEOUT_SECONDS=360 -# Azure OpenAI (alternative to direct OpenAI) -# AZURE_OPENAI_API_KEY=your_azure_openai_api_key_here -# AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/ +#i Maximum rerun attempts for failed records. +#d int +#r 0,20,1 +#v EVA_MAX_RERUN_ATTEMPTS=3 -# Google API key (alternative to service account credentials for Gemini) -# GOOGLE_API_KEY=your_google_api_key_here - -# ============================================== -# Optional: Speech-to-Speech / Audio-LLM Configuration -# ============================================== -# Only needed if benchmarking speech-to-speech models. +#i Output directory for results. +#d path +#v EVA_OUTPUT_DIR=output -# Base framework (pipecat, openai_realtime, gemini_live) -# EVA_FRAMEWORK=openai_realtime +#i Starting port for WebSocket servers. +#d int +#r 1024,65000,1 +#v EVA_BASE_PORT=10000 -# EVA_MODEL__S2S=openai -# EVA_MODEL__S2S_PARAMS='{"model": "gpt-realtime-mini", "api_key": ""}' +#i Number of ports in the pool. +#d int +#r 10,500,1 +#v EVA_PORT_POOL_SIZE=150 -# EVA_MODEL__AUDIO_LLM= -# EVA_MODEL__AUDIO_LLM_PARAMS='{"url": "", "api_key": ""}' +#i Comma-separated metric names to run. Leave empty to run all metrics. +#d csv_list +#v EVA_METRICS= # ============================================== -# Optional: Execution Settings +# Turn Detection & VAD # ============================================== -# Maximum number of concurrent conversations (1-100, default: 1) -# EVA_MAX_CONCURRENT_CONVERSATIONS=1 +# Leave all of these inactive to use smart defaults. -# Conversation timeout in seconds (30-10000, default: 360) -# EVA_CONVERSATION_TIMEOUT_SECONDS=360 +#i Turn start strategy: when to consider the user has started speaking. +#d enum +#e vad,transcription,external +#v EVA_MODEL__TURN_START_STRATEGY=vad -# Maximum number of rerun attempts for failed records (0-20, default: 3) -# EVA_MAX_RERUN_ATTEMPTS=3 +#i Turn start strategy parameters (JSON). 
+#d json_object +#v EVA_MODEL__TURN_START_STRATEGY_PARAMS='{}' -# Output directory for results (default: output) -# EVA_OUTPUT_DIR=output +#i Turn stop strategy: when to consider the user has finished speaking. +#d enum +#e turn_analyzer,speech_timeout,external +#v EVA_MODEL__TURN_STOP_STRATEGY=turn_analyzer -# Starting port for WebSocket servers (1024-65000, default: 10000) -# EVA_BASE_PORT=10000 +#i Turn stop strategy parameters. For speech_timeout: {"user_speech_timeout": 0.8}. +#d json_object +#v EVA_MODEL__TURN_STOP_STRATEGY_PARAMS='{}' -# Number of ports in the pool (10-500, default: 150) -# EVA_PORT_POOL_SIZE=150 +#i VAD (Voice Activity Detection) analyzer. +#d enum +#e silero,none +#v EVA_MODEL__VAD=silero -# Comma-separated list of metrics to run (empty = no metrics, default: all metrics) -# EVA_METRICS= - -# Debug mode: run only 1 record regardless of dataset size (true | false, default: false) -# EVA_DEBUG=true - -# Comma-separated list of specific record IDs to run (empty = run all) -# EVA_RECORD_IDS=1.2.1,1.2.2,1.3.1 - -# Logging level (DEBUG | INFO | WARNING | ERROR | CRITICAL, default: INFO) -# EVA_LOG_LEVEL=DEBUG +#i VAD parameters. Keys: confidence (0-1), start_secs, stop_secs, min_volume (0-1). +#d json_object +#v EVA_MODEL__VAD_PARAMS='{"start_secs": 0.2, "stop_secs": 0.2, "min_volume": 0.6, "confidence": 0.7}' # ============================================== -# Optional: Turn Detection & VAD Configuration +# User Config # ============================================== -# Fine-tune user turn detection and voice activity detection. -# Leave commented to use smart defaults. 
- -# User turn start strategy: vad | transcription | external -# - vad: Start turn when VAD detects speech (default) -# - transcription: Start turn when STT produces transcription -# - external: Delegate to external service (e.g., Deepgram Flux) -# EVA_MODEL__TURN_START_STRATEGY=vad - -# User turn start strategy parameters (JSON) -# EVA_MODEL__TURN_START_STRATEGY_PARAMS='{}' - -# User turn stop strategy: turn_analyzer | speech_timeout | external -# - turn_analyzer: Use smart turn analyzer to detect natural turn end (default) -# - speech_timeout: Stop after fixed silence duration -# - external: Delegate to external service -# EVA_MODEL__TURN_STOP_STRATEGY=turn_analyzer - -# User turn stop strategy parameters (JSON) -# For speech_timeout: {"user_speech_timeout": 0.8} -# For turn_analyzer: automatically uses smart turn detection -# EVA_MODEL__TURN_STOP_STRATEGY_PARAMS='{}' - -# Note: For services with built-in turn detection (e.g., Deepgram Flux), set both to 'external': -# EVA_MODEL__TURN_START_STRATEGY=external -# EVA_MODEL__TURN_STOP_STRATEGY=external - -# VAD (Voice Activity Detection) analyzer: silero | none -# EVA_MODEL__VAD=silero - -# VAD parameters (JSON) -# - confidence: Minimum confidence threshold (0.0-1.0, default: 0.7) -# - start_secs: Duration to wait before confirming voice start (default: 0.2) -# - stop_secs: Duration to wait before confirming voice stop (default: 0.2) -# - min_volume: Minimum audio volume threshold (0.0-1.0, default: 0.6) -# EVA_MODEL__VAD_PARAMS='{"start_secs": 0.2, "stop_secs": 0.2, "min_volume": 0.6, "confidence": 0.7}' -# ============================================== -# Optional: Perturbations -# ============================================== -# Perturb the simulated user to stress-test the assistant under realistic -# conditions. Four independent axes, all optional: -# -# 1. background_noise — ambient audio mixed into user speech -# 2. accent — swaps the ElevenLabs user agent to an accented voice -# 3. 
behavior — swaps the agent + modifies persona prompt -# 4. connection_degradation — stacks VoIP artifacts (codec, packet loss, gain jitter) -# +# --- Default user simulator agents --- +#i ElevenLabs agent ID for the default female-voice user persona. +#d string +EVA_DEFAULT_USER_F=your_elevenlabs_agent_id_for_default_user_f + +#i ElevenLabs agent ID for the default male-voice user persona. +#d string +EVA_DEFAULT_USER_M=your_elevenlabs_agent_id_for_default_user_m + +# --- Perturbations --- # accent and behavior are MUTUALLY EXCLUSIVE (each claims the agent ID slot). # background_noise and connection_degradation can stack with either. + # --- Background noise --- -# Options: airport_gate | baby_crying | background_music | bad_connection_static | -# coffee_shop | loud_construction | nyc_street | road_noise -# Requires assets in assets/noise/. Download with: -# python scripts/download_noise_assets.py -# EVA_PERTURBATION__BACKGROUND_NOISE=coffee_shop -# -# Signal-to-noise ratio in dB (higher = cleaner user speech; default: 15) -# EVA_PERTURBATION__SNR_DB=15 +# Requires assets in assets/noise/. Download with: python scripts/download_noise_assets.py +#i Ambient noise to mix into user speech. +#d enum +#e airport_gate,baby_crying,background_music,bad_connection_static,coffee_shop,loud_construction,nyc_street,road_noise +#v EVA_PERTURBATION__BACKGROUND_NOISE=coffee_shop + +#i Signal-to-noise ratio in dB. Higher = cleaner user speech. +#d float +#r 0,40,1 +#v EVA_PERTURBATION__SNR_DB=15 # --- Connection degradation --- -# G.711 codec quantisation + gaussian static + 3% packet loss + random gain. -# No assets required. -# EVA_PERTURBATION__CONNECTION_DEGRADATION=false +#i Apply G.711 codec + gaussian static + 3% packet loss + random gain. +#d bool +#v EVA_PERTURBATION__CONNECTION_DEGRADATION=false + +# --- Accent (mutually exclusive with Behavior) --- +#i Accent to apply to the user simulator. Requires matching agent IDs below. 
+#d enum +#e french,indian,spanish,chinese +#x perturbation_mode=Accent +#v EVA_PERTURBATION__ACCENT=french + +# --- Accent agent IDs --- +#i ElevenLabs agent ID — French accent, female voice. +#d string +#x perturbation_mode=Accent +#x EVA_PERTURBATION__ACCENT=french +#v EVA_FRENCH_ACCENT_USER_F= + +#i ElevenLabs agent ID — French accent, male voice. +#d string +#x perturbation_mode=Accent +#x EVA_PERTURBATION__ACCENT=french +#v EVA_FRENCH_ACCENT_USER_M= + +#i ElevenLabs agent ID — Indian accent, female voice. +#d string +#x perturbation_mode=Accent +#x EVA_PERTURBATION__ACCENT=indian +#v EVA_INDIAN_ACCENT_USER_F= + +#i ElevenLabs agent ID — Indian accent, male voice. +#d string +#x perturbation_mode=Accent +#x EVA_PERTURBATION__ACCENT=indian +#v EVA_INDIAN_ACCENT_USER_M= + +#i ElevenLabs agent ID — Spanish accent, female voice. +#d string +#x perturbation_mode=Accent +#x EVA_PERTURBATION__ACCENT=spanish +#v EVA_SPANISH_ACCENT_USER_F= + +#i ElevenLabs agent ID — Spanish accent, male voice. +#d string +#x perturbation_mode=Accent +#x EVA_PERTURBATION__ACCENT=spanish +#v EVA_SPANISH_ACCENT_USER_M= + +#i ElevenLabs agent ID — Chinese accent, female voice. +#d string +#x perturbation_mode=Accent +#x EVA_PERTURBATION__ACCENT=chinese +#v EVA_CHINESE_ACCENT_USER_F= + +#i ElevenLabs agent ID — Chinese accent, male voice. +#d string +#x perturbation_mode=Accent +#x EVA_PERTURBATION__ACCENT=chinese +#v EVA_CHINESE_ACCENT_USER_M= + +# --- Behavior (mutually exclusive with Accent) --- +#i Behavior persona for the user simulator. Requires matching agent IDs below. +#d enum +#e aggressive_impatient,elderly_slow,forgetful_disorganized +#x perturbation_mode=Behavior +#v EVA_PERTURBATION__BEHAVIOR=forgetful_disorganized + +# --- Behavior agent IDs --- +#i ElevenLabs agent ID — Aggressive/impatient persona, female voice. 
+#d string +#x perturbation_mode=Behavior +#x EVA_PERTURBATION__BEHAVIOR=aggressive_impatient +#v EVA_AGGRESSIVE_IMPATIENT_USER_F= + +#i ElevenLabs agent ID — Aggressive/impatient persona, male voice. +#d string +#x perturbation_mode=Behavior +#x EVA_PERTURBATION__BEHAVIOR=aggressive_impatient +#v EVA_AGGRESSIVE_IMPATIENT_USER_M= + +#i ElevenLabs agent ID — Elderly/slow persona, female voice. +#d string +#x perturbation_mode=Behavior +#x EVA_PERTURBATION__BEHAVIOR=elderly_slow +#v EVA_ELDERLY_SLOW_USER_F= + +#i ElevenLabs agent ID — Elderly/slow persona, male voice. +#d string +#x perturbation_mode=Behavior +#x EVA_PERTURBATION__BEHAVIOR=elderly_slow +#v EVA_ELDERLY_SLOW_USER_M= + +#i ElevenLabs agent ID — Forgetful/disorganized persona, female voice. +#d string +#x perturbation_mode=Behavior +#x EVA_PERTURBATION__BEHAVIOR=forgetful_disorganized +#v EVA_FORGETFUL_DISORGANIZED_USER_F= + +#i ElevenLabs agent ID — Forgetful/disorganized persona, male voice. +#d string +#x perturbation_mode=Behavior +#x EVA_PERTURBATION__BEHAVIOR=forgetful_disorganized +#v EVA_FORGETFUL_DISORGANIZED_USER_M= -# --- Accent (mutually exclusive with behavior) --- -# Options: french | indian | spanish | chinese -# EVA_PERTURBATION__ACCENT=french -# -# Accent-specific ElevenLabs agent IDs. Pattern: EVA_{ACCENT}_ACCENT_USER_{F|M} -# Only the pair matching EVA_PERTURBATION__ACCENT needs to be set. -# EVA_FRENCH_ACCENT_USER_F= -# EVA_FRENCH_ACCENT_USER_M= -# EVA_INDIAN_ACCENT_USER_F= -# EVA_INDIAN_ACCENT_USER_M= -# EVA_SPANISH_ACCENT_USER_F= -# EVA_SPANISH_ACCENT_USER_M= -# EVA_CHINESE_ACCENT_USER_F= -# EVA_CHINESE_ACCENT_USER_M= - -# --- Behavior (mutually exclusive with accent) --- -# Options: aggressive_impatient | elderly_slow | forgetful_disorganized -# EVA_PERTURBATION__BEHAVIOR=forgetful_disorganized -# -# Behavior-specific ElevenLabs agent IDs. Pattern: EVA_{BEHAVIOR}_USER_{F|M} -# Only the pair matching EVA_PERTURBATION__BEHAVIOR needs to be set. 
-# EVA_AGGRESSIVE_IMPATIENT_USER_F= -# EVA_AGGRESSIVE_IMPATIENT_USER_M= -# EVA_ELDERLY_SLOW_USER_F= -# EVA_ELDERLY_SLOW_USER_M= -# EVA_FORGETFUL_DISORGANIZED_USER_F= -# EVA_FORGETFUL_DISORGANIZED_USER_M= +# ============================================== +# Debug & Logging +# ============================================== + +#i Run only 1 record regardless of dataset size. +#d bool +#v EVA_DEBUG=false + +#i Comma-separated record IDs to run. Empty = run all. +#d csv_list +#v EVA_RECORD_IDS= + +#i Logging verbosity. +#d enum +#e DEBUG,INFO,WARNING,ERROR,CRITICAL +#v EVA_LOG_LEVEL=INFO diff --git a/README.md b/README.md index 221486b3..a9d5edd1 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,16 @@ eva \ --metrics task_completion,faithfulness,conciseness ``` +### Configuring EVA + +EVA includes a Streamlit config editor for building your `.env` file interactively: + +```bash +streamlit run apps/config_editor.py +``` + +The editor covers all variables grouped by tab (API keys, voice pipeline, model deployments, runtime settings, perturbations, etc.), with proper widgets for each type. See [`apps/README.md`](apps/README.md) for details. + ### Exploring Results EVA includes a Streamlit analysis app for visualizing and comparing results: diff --git a/apps/README.md b/apps/README.md index 5ed18405..69a524c2 100644 --- a/apps/README.md +++ b/apps/README.md @@ -1,6 +1,20 @@ # EVA Apps -Streamlit applications for exploring EVA results. +Streamlit applications for exploring and configuring EVA. + +## Config Editor + +Interactive UI for building and editing `.env` configuration files without hand-editing JSON or looking up variable names. + +### Usage + +```bash +streamlit run apps/config_editor.py +``` + +The app reads `.env.example` for the full variable set and loads existing values from `.env` if present. 
Each variable's widget type, enum options, ranges, and tooltips are declared directly in `.env.example` using annotation prefixes (`#i`, `#d`, `#e`, `#r`, `#x`, `#v`). Use the **Preview** button to inspect the generated file before saving, or **Download** to export it without writing to disk. + +--- ## Analysis App diff --git a/apps/analysis.py b/apps/analysis.py index 121b8d11..101fdcdc 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1585,7 +1585,7 @@ def _pivot_metric_label(m: str) -> str: height=max(350, 80 + 40 * len(y_labels)), margin={"l": 20, "r": 20, "t": 50, "b": 120}, ) - st.plotly_chart(heatmap_fig, use_container_width=True) + st.plotly_chart(heatmap_fig, width="stretch") def render_run_overview(run_dir: Path): diff --git a/apps/config_editor.py b/apps/config_editor.py new file mode 100644 index 00000000..f59995b4 --- /dev/null +++ b/apps/config_editor.py @@ -0,0 +1,655 @@ +"""Streamlit app for editing EVA's .env file with a friendly UI. + +Run with: + + streamlit run apps/config_editor.py + +The app reads .env.example to discover variables and their metadata +(widget type, options, ranges, tooltips, conditions) from annotation +prefixes (#i, #d, #e, #r, #g, #x, #v). .env is read on startup to +prefill values and written on save. 
+""" + +from __future__ import annotations + +import html as html_module +import json +import sys +from pathlib import Path +from typing import Any + +import streamlit as st +import streamlit.components.v1 as st_components +from config_io import ( + AnnotatedVar, + ParsedEnvExample, + compute_disabled, + load_env, + parse_env_example, + serialize_env, +) +from config_schema import ( + GROUP_API_CONFIGS, + GROUP_DEPLOYMENTS, + GROUP_MISC, + GROUP_PERTURBATIONS, + GROUP_RUNTIME, + GROUPS, + MUTEX_RADIOS, +) + +REPO_ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(REPO_ROOT)) + +ENV_EXAMPLE_PATH = REPO_ROOT / ".env.example" +ENV_PATH = REPO_ROOT / ".env" + + +# --------------------------------------------------------------------------- +# Initialisation +# --------------------------------------------------------------------------- + + +def _coerce(widget: str, raw: str) -> Any: + if not raw: + return _empty_for(widget) + raw = raw.strip() + try: + if widget == "bool": + return raw.lower() in ("true", "1", "yes", "on") + if widget == "int": + return int(raw) + if widget == "float": + return float(raw) + if widget == "csv_list": + return [x.strip() for x in raw.split(",") if x.strip()] + if widget in ("json_object", "json_deployment_list"): + s = raw.strip() + if s.startswith("'") and s.endswith("'"): + s = s[1:-1] + return json.loads(s) + except Exception: + return _empty_for(widget) + return raw + + +def _empty_for(widget: str) -> Any: + if widget == "bool": + return False + if widget in ("int", "float"): + return None + if widget in ("csv_list", "json_deployment_list"): + return [] + if widget == "json_object": + return {} + return "" + + +def _detect_pipeline_mode(env: dict[str, str]) -> str: + if env.get("EVA_MODEL__S2S"): + return "S2S" + if env.get("EVA_MODEL__AUDIO_LLM"): + return "AudioLLM" + return "LLM" + + +def _detect_perturbation_mode(env: dict[str, str]) -> str: + if env.get("EVA_PERTURBATION__ACCENT"): + return "Accent" + if 
def _init_state() -> None:
    """Populate ``st.session_state`` on the first run of a session.

    Later reruns return immediately because the ``initialized`` flag is set.
    On the first run this:

    * parses ``.env.example`` and stores the result as ``parsed``;
    * loads ``.env`` and builds ``field_values``: a value from ``.env`` wins,
      otherwise an active template variable falls back to its example value
      (stripped of surrounding quotes), otherwise the widget's empty value;
    * keeps ``.env`` entries the template does not declare as raw strings;
    * records which keys came from ``.env`` (``loaded_keys``) and derives the
      pipeline and perturbation modes from those loaded values;
    * gives every mutex radio that has no state yet its default.
    """
    if "initialized" in st.session_state:
        return
    parsed = parse_env_example(ENV_EXAMPLE_PATH)
    st.session_state.parsed = parsed
    existing = load_env(ENV_PATH)

    values: dict[str, Any] = {}
    for var in parsed.vars:
        raw = existing.get(var.name)
        if raw is None and var.is_active:
            raw = var.example_value.strip().strip("'\"")
        values[var.name] = _coerce(var.widget, raw or "")

    # Built once: the template's names do not change while scanning .env.
    known_names = {var.name for var in parsed.vars}
    for name, raw in existing.items():
        if name not in known_names:
            values[name] = raw

    st.session_state.field_values = values
    st.session_state.loaded_keys = set(existing.keys())
    st.session_state.pipeline_mode = _detect_pipeline_mode(existing)
    st.session_state.perturbation_mode = _detect_perturbation_mode(existing)
    # Initialise any mutex radio state not already set above.
    for mx in MUTEX_RADIOS:
        if mx.state_key not in st.session_state:
            st.session_state[mx.state_key] = mx.default
    st.session_state.initialized = True
("string", "path"): + values[var.name] = st.text_input(var.name, value=current or "", help=help_text, key=f"w_{var.name}") + elif var.widget == "secret": + values[var.name] = st.text_input( + var.name, value=current or "", help=help_text, type="password", key=f"w_{var.name}" + ) + elif var.widget == "bool": + values[var.name] = st.checkbox(var.name, value=bool(current), help=help_text, key=f"w_{var.name}") + elif var.widget == "int": + rng = var.range + v = current if isinstance(current, int) else (int(rng[0]) if rng else 0) + values[var.name] = st.number_input( + var.name, + value=v, + min_value=int(rng[0]) if rng else None, + max_value=int(rng[1]) if rng and len(rng) > 1 else None, + step=int(rng[2]) if rng and len(rng) > 2 else 1, + help=help_text, + key=f"w_{var.name}", + ) + elif var.widget == "float": + rng = var.range + v = float(current) if isinstance(current, (int, float)) else (float(rng[0]) if rng else 0.0) + values[var.name] = st.number_input( + var.name, + value=float(v), + min_value=float(rng[0]) if rng else None, + max_value=float(rng[1]) if rng and len(rng) > 1 else None, + step=float(rng[2]) if rng and len(rng) > 2 else 0.1, + help=help_text, + key=f"w_{var.name}", + ) + elif var.widget == "enum": + options = _enum_options_for(var) + display = ["(unset)"] + options + idx = (options.index(current) + 1) if current in options else 0 + choice = st.selectbox(var.name, display, index=idx, help=help_text, key=f"w_{var.name}") + values[var.name] = "" if choice == "(unset)" else choice + elif var.widget == "multi_enum": + choices = current if isinstance(current, list) else [] + values[var.name] = st.multiselect(var.name, var.options, default=choices, help=help_text, key=f"w_{var.name}") + elif var.widget == "csv_list": + as_text = ",".join(current) if isinstance(current, list) else (current or "") + text = st.text_input(var.name, value=as_text, help=help_text, key=f"w_{var.name}") + values[var.name] = [x.strip() for x in text.split(",") if x.strip()] + elif 
var.widget == "json_object": + _render_json_object(var.name, var.info, current or {}) + elif var.widget == "json_deployment_list": + _render_deployment_list(var.name, var.info, current or []) + + +def _enum_options_for(var: AnnotatedVar) -> list[str]: + if var.name == "EVA_MODEL__LLM": + deployments = st.session_state.field_values.get("EVA_MODEL_LIST") or [] + return sorted({d.get("model_name", "") for d in deployments if isinstance(d, dict)} - {""}) + return var.options + + +def _render_json_object(name: str, info: str, current: dict) -> None: + st.markdown(f"**{name}**" + (f" — {info}" if info else "")) + raw_key = f"raw_{name}" + if raw_key not in st.session_state: + st.session_state[raw_key] = json.dumps(current, indent=2) if current else "" + + rows = [{"key": k, "value": _scalar_to_str(v)} for k, v in current.items()] or [{"key": "", "value": ""}] + edited = st.data_editor( + rows, + num_rows="dynamic", + width="stretch", + column_config={ + "key": st.column_config.TextColumn("key", required=False), + "value": st.column_config.TextColumn("value", required=False), + }, + key=f"de_{name}", + ) + parsed_kv: dict[str, Any] = {} + for row in edited: + k = (row.get("key") or "").strip() + if k: + parsed_kv[k] = _str_to_scalar(row.get("value")) + + with st.expander("Raw JSON", expanded=False): + text = st.text_area( + "Edit as JSON", value=json.dumps(parsed_kv, indent=2) if parsed_kv else "", key=raw_key, height=140 + ) + if text.strip(): + try: + parsed_kv = json.loads(text) + except json.JSONDecodeError as e: + st.warning(f"Invalid JSON: {e}") + + st.session_state.field_values[name] = parsed_kv + + +def _scalar_to_str(v: Any) -> str: + if isinstance(v, (dict, list)): + return json.dumps(v) + if isinstance(v, bool): + return "true" if v else "false" + if v is None: + return "" + return str(v) + + +def _str_to_scalar(v: Any) -> Any: + if not isinstance(v, str): + return v + s = v.strip() + if s == "": + return "" + if s.lower() in ("true", "false"): + return 
s.lower() == "true" + if s.startswith(("{", "[")): + try: + return json.loads(s) + except json.JSONDecodeError: + return s + try: + return int(s) if "." not in s else float(s) + except ValueError: + return s + + +def _render_deployment_list(name: str, info: str, current: list) -> None: + st.markdown(f"**{name}**" + (f" — {info}" if info else "")) + deployments: list[dict] = [d for d in current if isinstance(d, dict)] + + st.caption("All deployments — add / remove rows here, then select one below to edit its params.") + summary_rows = [ + {"model_name": d.get("model_name", ""), "provider/model": (d.get("litellm_params") or {}).get("model", "")} + for d in deployments + ] or [{"model_name": "", "provider/model": ""}] + + edited_summary = st.data_editor( + summary_rows, + num_rows="dynamic", + width="stretch", + column_config={ + "model_name": st.column_config.TextColumn("model_name (alias)", required=False), + "provider/model": st.column_config.TextColumn("provider/model (litellm_params.model)", required=False), + }, + key=f"de_summary_{name}", + ) + + old_by_name = {d.get("model_name", ""): d for d in deployments} + merged: list[dict] = [] + for idx, row in enumerate(edited_summary): + rname = (row.get("model_name") or "").strip() + if not rname: + continue + base = dict( + old_by_name.get(rname) or old_by_name.get(list(old_by_name)[idx] if idx < len(old_by_name) else "") or {} + ) + base["model_name"] = rname + lp = dict(base.get("litellm_params") or {}) + pm = (row.get("provider/model") or "").strip() + if pm: + lp["model"] = pm + base["litellm_params"] = lp + merged.append(base) + deployments = merged + + model_names = [d.get("model_name", "") for d in deployments if d.get("model_name")] + if not model_names: + st.session_state.field_values[name] = deployments + return + + sel_key = f"_depl_sel_{name}" + prev = st.session_state.get(sel_key) + default_idx = model_names.index(prev) if prev in model_names else 0 + selected = st.selectbox("Edit deployment", 
options=model_names, index=default_idx, key=sel_key) + + sel_idx = model_names.index(selected) + depl = deployments[sel_idx] + + st.markdown("**litellm_params**") + lp = depl.get("litellm_params") or {} + lp_rows = [{"key": k, "value": _scalar_to_str(v)} for k, v in lp.items()] or [{"key": "", "value": ""}] + edited_lp = st.data_editor( + lp_rows, + num_rows="dynamic", + width="stretch", + column_config={ + "key": st.column_config.TextColumn("key", required=False), + "value": st.column_config.TextColumn("value", required=False), + }, + key=f"de_lp_{name}_{selected}", + ) + new_lp: dict[str, Any] = { + (r.get("key") or "").strip(): _str_to_scalar(r.get("value")) for r in edited_lp if (r.get("key") or "").strip() + } + depl["litellm_params"] = new_lp + + extra_fields = {k: v for k, v in depl.items() if k not in ("model_name", "litellm_params")} + if extra_fields or st.checkbox("Add extra top-level fields", key=f"_extra_chk_{name}_{selected}"): + st.markdown("**Extra top-level fields** (e.g. 
def _render_add_var_widget(context: str) -> None:
    """Render the "Add a new variable" input + button for one tab.

    A new, empty entry is added to ``st.session_state.field_values`` when the
    submitted name passes every check below; the page is then re-run so the
    variable shows up in whichever tab it is routed to.

    The text input's widget key embeds a per-context counter that is bumped
    after each successful add, so the next run gets a fresh (empty) input.

    Args:
        context: Short tag (the callers pass ``"api"`` / ``"misc"``) used to
            keep the widget keys of the different tabs distinct.
    """
    st.divider()
    st.markdown("**Add a new variable**")
    counter_key = f"_add_var_counter_{context}"
    if counter_key not in st.session_state:
        st.session_state[counter_key] = 0
    input_key = f"_add_var_input_{context}_{st.session_state[counter_key]}"
    col_input, col_btn = st.columns([4, 1])
    with col_input:
        new_name = st.text_input(
            "Variable name",
            key=input_key,
            label_visibility="collapsed",
            placeholder="e.g. MY_API_KEY",
        )
    with col_btn:
        if st.button("Add", key=f"_add_var_btn_{context}", width="stretch"):
            name = new_name.strip().upper()
            if not name:
                st.warning("Please enter a variable name.")
            elif not (name.isascii() and name.isidentifier()):
                # Anything else (spaces, "=", leading digit, ...) would be written
                # as a line that a NAME=value parser cannot read back.
                st.error(
                    f"`{name}` is not a valid variable name. Use only letters, digits and "
                    "underscores, and do not start with a digit."
                )
            elif name.startswith("EVA_"):
                # Same instruction as the caption below: the template is the place
                # where EVA_* variables get their widget metadata.
                st.error("`EVA_*` variables are managed via `.env.example`. Add it there to get a proper widget.")
            elif name in st.session_state.field_values or name in {v.name for v in st.session_state.parsed.vars}:
                st.warning(f"`{name}` already exists.")
            else:
                st.session_state.field_values[name] = ""
                st.session_state[counter_key] += 1
                st.rerun()
    st.caption(
        "Variables containing **KEY** or **URL** are placed under *API Configs*. "
        "Everything else stays here. `EVA_*` variables cannot be added here — "
        "add them to `.env.example` instead."
    )
_render_annotated_var(schema_map[name]) + else: + _render_unmapped_var(name) + _render_add_var_widget("api") + else: + for var in group_vars: + _render_annotated_var(var) + for name in auto_names: + _render_unmapped_var(name) + + # Cross-field validation for deployments tab + if group == GROUP_DEPLOYMENTS: + deployments = st.session_state.field_values.get("EVA_MODEL_LIST") or [] + chosen = st.session_state.field_values.get("EVA_MODEL__LLM") + names = {d.get("model_name") for d in deployments if isinstance(d, dict)} + if chosen and chosen not in names: + st.error( + f"EVA_MODEL__LLM = `{chosen}` does not match any deployment in EVA_MODEL_LIST. " + "Add it above or pick a different alias." + ) + + +def _render_misc_tab(parsed: ParsedEnvExample) -> None: + known = set(parsed.by_name) + truly_misc = [n for n in st.session_state.field_values if n not in known and _auto_group_for(n) is None] + # Also add template vars with no group assignment + for var in parsed.vars: + if var.group is None and var.name not in list(truly_misc): + truly_misc.append(var.name) + + if not truly_misc: + st.info("No unmapped variables. 🎉") + else: + st.warning( + f"Found {len(truly_misc)} variable(s) not covered by the template. " + "Add them to `.env.example` for proper widgets." 
def _build_serialized() -> str:
    """Serialize the editor's current session state into ``.env`` text.

    Steps:

    1. Take every entry of ``st.session_state.field_values`` that
       ``_is_meaningful`` keeps.
    2. Join ``csv_list`` values that are still lists into one comma-separated
       string.
    3. Build the state used for ``#x`` condition evaluation from the mutex
       radios plus every string-valued variable, and compute the disabled set.
    4. Split variables that are not in the template ("extras"): those routed
       to the API-configs or runtime group are passed as ``section_extras`` so
       they are written inline in that section; every other extra is passed
       through ``values`` so ``serialize_env`` appends it to its Misc section.

    Returns:
        The text produced by ``serialize_env``.
    """
    values = {k: v for k, v in st.session_state.field_values.items() if _is_meaningful(k, v)}
    parsed: ParsedEnvExample = st.session_state.parsed
    known = set(parsed.by_name)
    # csv_list → comma-separated string for serializer
    for var in parsed.vars:
        if var.widget == "csv_list" and isinstance(values.get(var.name), list):
            values[var.name] = ",".join(values[var.name])
    # Collect current mode state for condition evaluation
    mode_state: dict[str, str] = {}
    for mx in MUTEX_RADIOS:
        mode_state[mx.state_key] = st.session_state.get(mx.state_key, mx.default)
    mode_state.update({k: str(v) for k, v in values.items() if isinstance(v, str)})
    disabled = compute_disabled(parsed, **mode_state)
    # Extras routed to one of these sections are written inline in that section.
    extras = {k: v for k, v in values.items() if k not in known}
    section_extras: dict[str, dict] = {}
    for section in (GROUP_API_CONFIGS, GROUP_RUNTIME):
        routed = {k: v for k, v in extras.items() if _auto_group_for(k) == section}
        if routed:
            section_extras[section] = dict(sorted(routed.items()))
    # Everything that was NOT inlined must still reach the serializer, otherwise it
    # is lost on save. Filtering on "no auto-route" alone dropped extras that are
    # routed to a group without inline handling (e.g. the perturbation group).
    inlined = {k for routed in section_extras.values() for k in routed}
    serializer_values = {k: v for k, v in values.items() if k not in inlined}
    return serialize_env(serializer_values, parsed, disabled=disabled, section_extras=section_extras or None)
@st.dialog("Preview .env", width="large")
def _show_preview(text: str) -> None:
    """Display ``text`` as an INI-highlighted code block inside the "Preview .env" dialog.

    Args:
        text: The serialized ``.env`` content to show.
    """
    st.code(text, language="ini")
+""" + +from __future__ import annotations + +import json +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") + + +@dataclass +class AnnotatedVar: + name: str + is_active: bool # False = declared with #v + example_value: str # raw default from file + widget: str # from #d or inferred + info: str # from #i lines (joined) + options: list[str] # from #e + range: tuple[float, ...] | None # (min, max[, step]) from #r + group: str | None # from #g or section header + conditions: list[tuple[str, str]] # from #x lines (AND semantics) + line_start: int + line_end: int + + +@dataclass +class ParsedEnvExample: + lines: list[str] + vars: list[AnnotatedVar] + by_name: dict[str, AnnotatedVar] = field(default_factory=dict) + + def __post_init__(self) -> None: + if not self.by_name: + self.by_name = {v.name: v for v in self.vars} + + # ── back-compat shim so old tests still compile ─────────────────────── + @property + def specs(self) -> list[AnnotatedVar]: + return self.vars + + +def _is_section_rule(line: str) -> bool: + s = line.strip() + return bool(re.match(r"^\s*#\s*={3,}\s*$", s)) + + +def _consume_quoted_continuation(lines: list[str], start_idx: int, value_head: str) -> int: + """If value_head opens an unterminated single/double-quoted string, scan forward.""" + stripped = value_head.strip() + if not stripped: + return start_idx + quote = stripped[0] + if quote not in ("'", '"'): + return start_idx + rest = stripped[1:] + if quote in rest: + return start_idx + for j in range(start_idx + 1, len(lines)): + if quote in lines[j]: + return j + return len(lines) - 1 + + +def _infer_widget(name: str, value: str) -> str: + """Best-effort widget type from variable name and example value.""" + n = name.upper() + v = value.strip().lower() + if any(x in n for x in ("KEY", "SECRET", "TOKEN", "PASSWORD")): + return "secret" + if "CREDENTIALS" in n or n.endswith("_PATH") or 
n.endswith("_DIR"): + return "path" + if v in ("true", "false"): + return "bool" + raw = v.strip("'\"") + if raw.startswith("["): + return "json_deployment_list" if "model_name" in raw else "json_object" + if raw.startswith("{"): + return "json_object" + try: + int(raw) + return "int" + except ValueError: + pass + try: + float(raw) + return "float" + except ValueError: + pass + return "string" + + +def parse_env_example(path: str | Path) -> ParsedEnvExample: + """Parse a .env.example file that uses the annotation prefix scheme.""" + text = Path(path).read_text() + raw_lines = text.splitlines(keepends=False) + + vars_list: list[AnnotatedVar] = [] + seen: set[str] = set() + current_section: str | None = None + + ann_info: list[str] = [] + ann_widget: str | None = None + ann_options: list[str] = [] + ann_range: tuple[float, ...] | None = None + ann_group: str | None = None + ann_conditions: list[tuple[str, str]] = [] + + def reset_ann() -> None: + nonlocal ann_info, ann_widget, ann_options, ann_range, ann_group, ann_conditions + ann_info = [] + ann_widget = None + ann_options = [] + ann_range = None + ann_group = None + ann_conditions = [] + + def emit_var(name: str, is_active: bool, value_head: str, line_start: int) -> int: + end_idx = _consume_quoted_continuation(raw_lines, line_start, value_head) + raw_value = ( + "\n".join([value_head, *raw_lines[line_start + 1 : end_idx + 1]]) if end_idx > line_start else value_head + ) + widget = ann_widget or _infer_widget(name, raw_value) + vars_list.append( + AnnotatedVar( + name=name, + is_active=is_active, + example_value=raw_value, + widget=widget, + info=" ".join(ann_info), + options=list(ann_options), + range=ann_range, + group=ann_group or current_section, + conditions=list(ann_conditions), + line_start=line_start, + line_end=end_idx, + ) + ) + seen.add(name) + reset_ann() + return end_idx + + i = 0 + while i < len(raw_lines): + line = raw_lines[i] + stripped = line.strip() + + # Section header block (# ===...=== / # 
Title / # ===...===) + if _is_section_rule(line): + if i + 1 < len(raw_lines): + inner = raw_lines[i + 1].lstrip("#").strip() + if inner and not _is_section_rule(raw_lines[i + 1]): + current_section = inner + reset_ann() + j = i + 1 + while j < len(raw_lines) and not _is_section_rule(raw_lines[j]): + j += 1 + i = j + 1 if j < len(raw_lines) else j + continue + + # Annotation lines — accumulate until next variable or reset + if stripped.startswith("#i "): + ann_info.append(stripped[3:].strip()) + i += 1 + continue + if stripped.startswith("#d "): + ann_widget = stripped[3:].strip() + i += 1 + continue + if stripped.startswith("#e "): + ann_options = [o.strip() for o in stripped[3:].split(",") if o.strip()] + i += 1 + continue + if stripped.startswith("#r "): + parts = [p.strip() for p in stripped[3:].split(",")] + try: + ann_range = tuple(float(p) for p in parts[:3]) # type: ignore[assignment] + except ValueError: + pass + i += 1 + continue + if stripped.startswith("#g "): + ann_group = stripped[3:].strip() + i += 1 + continue + if stripped.startswith("#x "): + cond = stripped[3:].strip() + if "=" in cond: + k, _, v = cond.partition("=") + ann_conditions.append((k.strip(), v.strip())) + i += 1 + continue + + # Inactive variable: #v NAME=value + if stripped.startswith("#v "): + rest = stripped[3:].strip() + if "=" in rest: + name, _, value_head = rest.partition("=") + name = name.strip() + if _NAME_RE.match(name) and name not in seen: + end_idx = emit_var(name, False, value_head, i) + i = end_idx + 1 + continue + reset_ann() + i += 1 + continue + + # Active variable: NAME=value (no leading #) + if not stripped.startswith("#") and "=" in stripped: + name, _, value_head = stripped.partition("=") + name = name.strip() + if _NAME_RE.match(name) and name not in seen: + end_idx = emit_var(name, True, value_head, i) + i = end_idx + 1 + continue + + # True comment or blank — reset annotation accumulator + reset_ann() + i += 1 + + return ParsedEnvExample(lines=raw_lines, 
vars=vars_list) + + +def load_env(path: str | Path) -> dict[str, str]: + """Read an existing .env into a flat {NAME: value} dict. + + Commented-out lines (including #v lines) are skipped. + Values have surrounding quotes stripped. + """ + p = Path(path) + if not p.exists(): + return {} + out: dict[str, str] = {} + i = 0 + lines = p.read_text().splitlines(keepends=False) + while i < len(lines): + line = lines[i] + stripped = line.strip() + if stripped.startswith("#") or not stripped: + i += 1 + continue + if "=" in stripped: + name, _, value_head = stripped.partition("=") + name = name.strip() + if _NAME_RE.match(name): + end_idx = _consume_quoted_continuation(lines, i, value_head) + raw = "\n".join([value_head, *lines[i + 1 : end_idx + 1]]) if end_idx > i else value_head + out[name] = _unquote(raw.strip()) + i = end_idx + 1 + continue + i += 1 + return out + + +def _unquote(value: str) -> str: + if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'): + return value[1:-1] + return value + + +def _format_value(value: Any) -> str: + if isinstance(value, bool): + return "true" if value else "false" + if isinstance(value, (int, float)): + return str(value) + if isinstance(value, (dict, list)): + return f"'{json.dumps(value)}'" + s = str(value) + if not s: + return "" + if any(c in s for c in (" ", "\t", "#", "'", '"', "$", "\n")): + if "'" not in s: + return f"'{s}'" + return json.dumps(s) + return s + + +def _has_value(v: Any) -> bool: + if v is None: + return False + if isinstance(v, str) and v == "": + return False + if isinstance(v, (list, dict)) and len(v) == 0: + return False + return True + + +def serialize_env( + values: dict[str, Any], + parsed: ParsedEnvExample, + disabled: set[str] | None = None, + section_extras: dict[str, dict[str, Any]] | None = None, +) -> str: + """Produce a .env text using parsed as the structural template. + + - Variables in values with a user-set entry → emitted as NAME=value (active). 
+ - Variables in disabled with a value → emitted as #v NAME=value (inactive, value preserved). + - Everything else → original line(s) from the template verbatim. + - section_extras: {section_title: {name: value}} injected inline at the end of each + named section (just before the next section header starts). + - Any values not in the template and not in section_extras are auto-appended as Misc. + """ + disabled = disabled or set() + section_extras = section_extras or {} + out: list[str] = [] + handled: set[str] = set() + var_by_start = {v.line_start: v for v in parsed.vars} + current_section: str | None = None + + def _flush_extras(section: str | None) -> None: + if not section or section not in section_extras: + return + for name, val in section_extras[section].items(): + if _has_value(val): + out.append(f"{name}={_format_value(val)}") + + i = 0 + while i < len(parsed.lines): + line = parsed.lines[i] + + # Detect the opening rule of a new section (rule whose next line is the title) + if _is_section_rule(line): + next_line = parsed.lines[i + 1] if i + 1 < len(parsed.lines) else "" + next_content = next_line.lstrip("#").strip() + if next_content and not _is_section_rule(next_line): + # Flush extras for the section we're leaving before writing the new header + _flush_extras(current_section) + current_section = next_content + + if i in var_by_start: + var = var_by_start[i] + user_value = values.get(var.name) + if var.name in disabled: + if _has_value(user_value): + out.append(f"#v {var.name}={_format_value(user_value)}") + else: + out.append(f"#v {var.name}={var.example_value.strip()}") + elif _has_value(user_value): + out.append(f"{var.name}={_format_value(user_value)}") + else: + out.extend(parsed.lines[var.line_start : var.line_end + 1]) + handled.add(var.name) + i = var.line_end + 1 + continue + + out.append(line) + i += 1 + + # Flush extras for the final section + _flush_extras(current_section) + + # Auto-collect any values not in the template into a Misc section 
def compute_disabled(parsed: ParsedEnvExample, **state_values: str) -> set[str]:
    """Collect the names of variables that have at least one unmet ``#x`` condition.

    A condition ``(key, expected)`` is met when ``state_values[key]`` equals
    ``expected``; a key missing from ``state_values`` is compared as ``""``.
    Variables without conditions are never disabled.

    Mode keys are supplied as keyword arguments, e.g.
    ``compute_disabled(parsed, pipeline_mode="LLM")``.
    """

    def _has_unmet_condition(conditions: list[tuple[str, str]]) -> bool:
        return any(state_values.get(key, "") != expected for key, expected in conditions)

    return {var.name for var in parsed.vars if _has_unmet_condition(var.conditions)}
@dataclass
class MutexRadio:
    """Definition of one radio-button group that selects a single mode.

    When ``default`` is left empty it falls back to the first entry of
    ``options`` (if there is one).
    """

    # st.session_state key managed by this radio
    state_key: str
    # which tab renders this radio
    group: str
    label: str
    options: list[str]
    help: str = ""
    default: str = ""

    def __post_init__(self) -> None:
        # Nothing to fill in when a default was given or there is nothing to pick from.
        if self.default or not self.options:
            return
        self.default = self.options[0]
def _load_isolated(env_file: Path) -> RunConfig:
    """Build a ``RunConfig`` from ``env_file`` with the process environment reduced to ``PATH``.

    ``os.environ`` is temporarily cleared (except for ``PATH``) while the
    config is constructed; CLI argument parsing is switched off.
    """
    minimal_env = {"PATH": os.environ["PATH"]}
    with patch.dict(os.environ, minimal_env, clear=True):
        config = RunConfig(_env_file=env_file, _cli_parse_args=False)
    return config
def test_s2s_pipeline_serialization_constructs_runconfig(tmp_path: Path) -> None:
    """A config serialized in S2S pipeline mode loads into a RunConfig with the S2S model set."""
    s2s_model = "gpt-realtime-mini"
    values = {
        "EVA_MODEL_LIST": _MODEL_LIST,
        "EVA_MODEL__S2S": s2s_model,
        "EVA_MODEL__S2S_PARAMS": {"api_key": "k", "model": s2s_model},
        "EVA_DOMAIN": "airline",
    }
    parsed = parse_env_example(REPO_ROOT / ".env.example")
    serialized = _serialize(values, parsed, pipeline_mode="S2S")

    env_file = tmp_path / ".env"
    env_file.write_text(serialized)

    config = _load_isolated(env_file)
    assert config.model.s2s == s2s_model
def test_parses_range() -> None:
    """The parsed range of EVA_MAX_CONCURRENT_CONVERSATIONS starts at 1 and ends at 100."""
    spec = parse_env_example(ENV_EXAMPLE).by_name["EVA_MAX_CONCURRENT_CONVERSATIONS"]
    assert spec.range is not None
    lower, upper = spec.range[0], spec.range[1]
    assert lower == 1.0
    assert upper == 100.0
def test_serialize_with_no_values_is_byte_identical_to_example() -> None:
    """Serializing with no overrides reproduces the template text (plus a final newline)."""
    template_text = ENV_EXAMPLE.read_text()
    # The serializer always terminates its output with a newline.
    expected = template_text if template_text.endswith("\n") else template_text + "\n"
    rendered = serialize_env({}, parse_env_example(ENV_EXAMPLE))
    assert rendered == expected
def test_round_trip_through_load_env(tmp_path: Path) -> None:
    """Values written by ``serialize_env`` are read back by ``load_env`` as their string forms."""
    overrides = {
        "OPENAI_API_KEY": "sk-abc",
        "EVA_DEBUG": True,
        "EVA_MAX_CONCURRENT_CONVERSATIONS": 8,
    }
    env_path = tmp_path / ".env"
    env_path.write_text(serialize_env(overrides, parse_env_example(ENV_EXAMPLE)))

    loaded = load_env(env_path)
    expected = {
        "OPENAI_API_KEY": "sk-abc",
        "EVA_DEBUG": "true",
        "EVA_MAX_CONCURRENT_CONVERSATIONS": "8",
    }
    for name, text in expected.items():
        assert loaded[name] == text