diff --git a/.env.example b/.env.example
index ef8f6bc6..b089ed59 100644
--- a/.env.example
+++ b/.env.example
@@ -11,70 +11,114 @@
# python scripts/run_text_only.py --record-id 1.1.2
# ==============================================
-# Required: API Keys
+# API Configs
# ==============================================
-# --- ElevenLabs (user simulator) ---
+# --- ElevenLabs ---
+#i ElevenLabs API key for the user simulator.
+#d secret
ELEVENLABS_API_KEY=your_elevenlabs_api_key_here
-# ElevenLabs Conversational AI agent IDs for user simulation.
-# Create a Conversational AI agent at https://elevenlabs.io/conversational-ai and copy its agent ID.
-# You need two agents: one with a female voice (persona 1) and one with a male voice (persona 2).
-# These are used to simulate different caller personas during benchmark conversations.
+# --- LLM / Text Judge ---
+#i OpenAI key for assistant LLM and text judge metrics.
+#d secret
+OPENAI_API_KEY=your_openai_api_key_here
-EVA_DEFAULT_USER_F=your_elevenlabs_agent_id_for_default_user_f
-EVA_DEFAULT_USER_M=your_elevenlabs_agent_id_for_default_user_m
+# --- Audio Judge (Gemini via GCP) ---
+#i Path to GCP service-account JSON for Gemini audio judge metrics.
+#d path
+GOOGLE_APPLICATION_CREDENTIALS=path/to/your/service-account-credentials.json
-# --- LLM (assistant + text judge metrics) ---
-OPENAI_API_KEY=your_openai_api_key_here
+# --- Faithfulness Metric (Claude via Bedrock) ---
+#i AWS access key for Claude via Bedrock (faithfulness metric).
+#d secret
+AWS_ACCESS_KEY_ID=your_aws_access_key_id_here
+
+#i AWS secret access key.
+#d secret
+AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here
+
+# --- Alternative providers (optional) ---
+# If you only have an OpenAI key you can skip AWS and set JUDGE_MODEL=gpt-5.2
+# to override all text judges. Audio judge metrics still require Gemini.
+
+#i Azure OpenAI key (alternative to direct OpenAI).
+#d secret
+#v AZURE_OPENAI_API_KEY=your_azure_openai_api_key_here
+
+#i Azure OpenAI endpoint URL.
+#d string
+#v AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
+
+#i Google API key (alternative to service-account credentials for Gemini).
+#d secret
+#v GOOGLE_API_KEY=your_google_api_key_here
+
+# ==============================================
+# Voice Pipeline
+# ==============================================
-# --- STT/TTS (voice pipeline) ---
-# The API key and model for your chosen provider must be passed via the *_PARAMS JSON.
+# Pipeline mode is controlled by the UI radio (LLM / S2S / AudioLLM).
+# The #x conditions below ensure each variable is only active for the right mode.
-# STT provider: assemblyai | cartesia | deepgram | deepgram-flux | elevenlabs | nvidia | nvidia-baseten | openai
+# --- LLM mode: STT ---
+#i STT provider for the voice pipeline.
+#d enum
+#e assemblyai,cartesia,deepgram,deepgram-flux,elevenlabs,nvidia,nvidia-baseten,openai
+#x pipeline_mode=LLM
EVA_MODEL__STT=cartesia
-# Must include "api_key" and "model" for your chosen provider:
+
+#i STT provider parameters. Must include "api_key" and "model".
+#d json_object
+#x pipeline_mode=LLM
EVA_MODEL__STT_PARAMS='{"api_key": "your_cartesia_api_key", "model": "ink-whisper"}'
-# TTS provider: cartesia | chatterbox | elevenlabs | gemini | kokoro | nvidia-baseten | openai | xtts
+# --- LLM mode: TTS ---
+#i TTS provider for the voice pipeline.
+#d enum
+#e cartesia,chatterbox,elevenlabs,gemini,kokoro,nvidia-baseten,openai,xtts
+#x pipeline_mode=LLM
EVA_MODEL__TTS=cartesia
-# Must include "api_key" and "model" for your chosen provider:
-EVA_MODEL__TTS_PARAMS='{"api_key": "your_cartesia_api_key", "model": "sonic"}'
-# For round-robin load balancing, use "urls" instead of "url":
-# EVA_MODEL__TTS_PARAMS='{"api_key": "...", "model": "sonic", "urls": ["http://server1/v1", "http://server2/v1"]}'
-
-# --- Metrics judge models ---
-# Google credentials (audio judge metrics default to Gemini)
-GOOGLE_APPLICATION_CREDENTIALS=path/to/your/service-account-credentials.json
-# AWS credentials (faithfulness metric defaults to Claude via Bedrock)
-AWS_ACCESS_KEY_ID=your_aws_access_key_id_here
-AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here
+#i TTS provider parameters. Must include "api_key" and "model". For round-robin load balancing, pass a "urls" list instead of "url".
+#d json_object
+#x pipeline_mode=LLM
+EVA_MODEL__TTS_PARAMS='{"api_key": "your_cartesia_api_key", "model": "sonic"}'
-# If you only have an OpenAI key, you can skip the AWS credentials above and
-# override all text judge models (including faithfulness) to use OpenAI instead
-# (results may be less accurate):
-# JUDGE_MODEL=gpt-5.2
-# Audio judge metrics (agent_speech_fidelity, user_speech_fidelity) still require
-# Gemini. To skip them, run only text-based metrics, e.g.:
-# EVA_METRICS=task_completion,faithfulness,conciseness,turn_taking
+# --- S2S mode ---
+#i Speech-to-speech model name.
+#d string
+#x pipeline_mode=S2S
+#v EVA_MODEL__S2S=openai
+
+#i Speech-to-speech model parameters.
+#d json_object
+#x pipeline_mode=S2S
+#v EVA_MODEL__S2S_PARAMS='{"model": "gpt-realtime-mini", "api_key": ""}'
+
+# --- AudioLLM mode ---
+#i Audio-input LLM model name.
+#d string
+#x pipeline_mode=AudioLLM
+#v EVA_MODEL__AUDIO_LLM=
+
+#i Audio-input LLM model parameters.
+#d json_object
+#x pipeline_mode=AudioLLM
+#v EVA_MODEL__AUDIO_LLM_PARAMS='{"url": "", "api_key": ""}'
+
+# --- Framework (S2S / AudioLLM) ---
+#i Base framework for S2S or AudioLLM pipelines.
+#d enum
+#e pipecat,openai_realtime,gemini_live,elevenlabs
+#v EVA_FRAMEWORK=openai_realtime
# ==============================================
-# Required: Model Deployments
+# LiteLLM Deployments
# ==============================================
-#
-# EVA_MODEL_LIST: JSON array of LiteLLM Router deployments.
-# - model_name: alias your code uses (e.g., "gpt-5.2")
-# - litellm_params.model: provider-specific identifier (e.g., "openai/gpt-4o")
-# - Use "os.environ/VAR_NAME" syntax to reference other env vars
-#
-# EVA needs at minimum:
-# 1. An LLM for the assistant (matches EVA_MODEL__LLM below)
-# 2. Gemini for audio judge metrics
-# 3. Claude (Bedrock) for the faithfulness metric
-#
-# See docs/llm_configuration.md for more provider examples and load balancing.
+#i LiteLLM Router deployments. Use "os.environ/VAR_NAME" to reference other env vars.
+#d json_deployment_list
EVA_MODEL_LIST='[
{
"model_name": "gpt-5.2",
@@ -106,222 +150,233 @@ EVA_MODEL_LIST='[
}
]'
-# --- Optional: additional model deployments ---
-# Uncomment and add to EVA_MODEL_LIST above as needed.
-#
-# Azure OpenAI (alternative to direct OpenAI):
-# {
-# "model_name": "gpt-5.2",
-# "litellm_params": {
-# "model": "azure/gpt-5.2",
-# "api_key": "os.environ/AZURE_OPENAI_API_KEY",
-# "api_base": "https://your-resource.openai.azure.com",
-# "max_parallel_requests": 5
-# },
-# "model_info": {"base_model": "gpt-5.2"}
-# }
-#
-# OpenAI Responses API (for multi-turn encrypted reasoning on o-series / gpt-5.x models):
-# Add "use_responses_api": true at the top level of the deployment (not inside litellm_params).
-# Also set "reasoning_effort" inside litellm_params to enable reasoning.
-# {
-# "model_name": "gpt-5.2",
-# "litellm_params": {
-# "model": "azure/gpt-5.2",
-# "api_key": "os.environ/AZURE_OPENAI_API_KEY",
-# "api_base": "https://your-resource.openai.azure.com",
-# "reasoning_effort": "low",
-# "max_parallel_requests": 5
-# },
-# "use_responses_api": true
-# }
-#
-# Self-hosted model (e.g., vLLM, NVIDIA NIM):
-# {
-# "model_name": "my-model",
-# "litellm_params": {
-# "model": "openai/my-model-name",
-# "api_key": "os.environ/MY_MODEL_KEY",
-# "api_base": "http://my-server:8000/v1",
-# "max_parallel_requests": 5
-# }
-# }
-#
-# Load balancing (multiple endpoints for the same model):
-# {
-# "model_name": "my-model",
-# "litellm_params": {"model": "openai/my-model", "api_base": "http://server1:8000/v1", ...}
-# },
-# {
-# "model_name": "my-model",
-# "litellm_params": {"model": "openai/my-model", "api_base": "http://server2:8000/v1", ...}
-# }
+#i LLM model alias for the assistant. Must match a model_name in EVA_MODEL_LIST.
+#d enum
+#x pipeline_mode=LLM
+EVA_MODEL__LLM=gpt-5.2
# ==============================================
-# Required: Framework Configuration
+# Framework & Runtime
# ==============================================
-# Domain name — determines dataset, agent config, and scenario paths:
-# data/{domain}_dataset.jsonl
-# configs/agents/{domain}_agent.yaml
-# data/{domain}_scenarios/
-# The included sample domain is "airline".
-# EVA_DOMAIN=airline
+#i Domain name; determines the dataset, agent config, and scenario paths (e.g. data/{domain}_dataset.jsonl).
+#d enum
+#e airline,itsm,medical_hr
+#v EVA_DOMAIN=airline
-# LLM model name — must match a model_name in EVA_MODEL_LIST above.
-EVA_MODEL__LLM=gpt-5.2
+#i Maximum number of concurrent conversations.
+#d int
+#r 1,100,1
+#v EVA_MAX_CONCURRENT_CONVERSATIONS=1
-# ==============================================
-# Optional: Alternative LLM Provider Keys
-# ==============================================
+#i Conversation timeout in seconds.
+#d int
+#r 30,10000,10
+#v EVA_CONVERSATION_TIMEOUT_SECONDS=360
-# Azure OpenAI (alternative to direct OpenAI)
-# AZURE_OPENAI_API_KEY=your_azure_openai_api_key_here
-# AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
+#i Maximum rerun attempts for failed records.
+#d int
+#r 0,20,1
+#v EVA_MAX_RERUN_ATTEMPTS=3
-# Google API key (alternative to service account credentials for Gemini)
-# GOOGLE_API_KEY=your_google_api_key_here
-
-# ==============================================
-# Optional: Speech-to-Speech / Audio-LLM Configuration
-# ==============================================
-# Only needed if benchmarking speech-to-speech models.
+#i Output directory for results.
+#d path
+#v EVA_OUTPUT_DIR=output
-# Base framework (pipecat, openai_realtime, gemini_live)
-# EVA_FRAMEWORK=openai_realtime
+#i Starting port for WebSocket servers.
+#d int
+#r 1024,65000,1
+#v EVA_BASE_PORT=10000
-# EVA_MODEL__S2S=openai
-# EVA_MODEL__S2S_PARAMS='{"model": "gpt-realtime-mini", "api_key": ""}'
+#i Number of ports in the pool.
+#d int
+#r 10,500,1
+#v EVA_PORT_POOL_SIZE=150
-# EVA_MODEL__AUDIO_LLM=
-# EVA_MODEL__AUDIO_LLM_PARAMS='{"url": "", "api_key": ""}'
+#i Comma-separated metric names to run. Leave empty to run all metrics.
+#d csv_list
+#v EVA_METRICS=
# ==============================================
-# Optional: Execution Settings
+# Turn Detection & VAD
# ==============================================
-# Maximum number of concurrent conversations (1-100, default: 1)
-# EVA_MAX_CONCURRENT_CONVERSATIONS=1
+# Leave all of these inactive to use smart defaults.
-# Conversation timeout in seconds (30-10000, default: 360)
-# EVA_CONVERSATION_TIMEOUT_SECONDS=360
+#i Turn start strategy: when to consider the user has started speaking.
+#d enum
+#e vad,transcription,external
+#v EVA_MODEL__TURN_START_STRATEGY=vad
-# Maximum number of rerun attempts for failed records (0-20, default: 3)
-# EVA_MAX_RERUN_ATTEMPTS=3
+#i Turn start strategy parameters (JSON).
+#d json_object
+#v EVA_MODEL__TURN_START_STRATEGY_PARAMS='{}'
-# Output directory for results (default: output)
-# EVA_OUTPUT_DIR=output
+#i Turn stop strategy: when to consider the user has finished speaking.
+#d enum
+#e turn_analyzer,speech_timeout,external
+#v EVA_MODEL__TURN_STOP_STRATEGY=turn_analyzer
-# Starting port for WebSocket servers (1024-65000, default: 10000)
-# EVA_BASE_PORT=10000
+#i Turn stop strategy parameters. For speech_timeout: {"user_speech_timeout": 0.8}.
+#d json_object
+#v EVA_MODEL__TURN_STOP_STRATEGY_PARAMS='{}'
-# Number of ports in the pool (10-500, default: 150)
-# EVA_PORT_POOL_SIZE=150
+#i VAD (Voice Activity Detection) analyzer.
+#d enum
+#e silero,none
+#v EVA_MODEL__VAD=silero
-# Comma-separated list of metrics to run (empty = no metrics, default: all metrics)
-# EVA_METRICS=
-
-# Debug mode: run only 1 record regardless of dataset size (true | false, default: false)
-# EVA_DEBUG=true
-
-# Comma-separated list of specific record IDs to run (empty = run all)
-# EVA_RECORD_IDS=1.2.1,1.2.2,1.3.1
-
-# Logging level (DEBUG | INFO | WARNING | ERROR | CRITICAL, default: INFO)
-# EVA_LOG_LEVEL=DEBUG
+#i VAD parameters. Keys: confidence (0-1), start_secs, stop_secs, min_volume (0-1).
+#d json_object
+#v EVA_MODEL__VAD_PARAMS='{"start_secs": 0.2, "stop_secs": 0.2, "min_volume": 0.6, "confidence": 0.7}'
# ==============================================
-# Optional: Turn Detection & VAD Configuration
+# User Config
# ==============================================
-# Fine-tune user turn detection and voice activity detection.
-# Leave commented to use smart defaults.
-
-# User turn start strategy: vad | transcription | external
-# - vad: Start turn when VAD detects speech (default)
-# - transcription: Start turn when STT produces transcription
-# - external: Delegate to external service (e.g., Deepgram Flux)
-# EVA_MODEL__TURN_START_STRATEGY=vad
-
-# User turn start strategy parameters (JSON)
-# EVA_MODEL__TURN_START_STRATEGY_PARAMS='{}'
-
-# User turn stop strategy: turn_analyzer | speech_timeout | external
-# - turn_analyzer: Use smart turn analyzer to detect natural turn end (default)
-# - speech_timeout: Stop after fixed silence duration
-# - external: Delegate to external service
-# EVA_MODEL__TURN_STOP_STRATEGY=turn_analyzer
-
-# User turn stop strategy parameters (JSON)
-# For speech_timeout: {"user_speech_timeout": 0.8}
-# For turn_analyzer: automatically uses smart turn detection
-# EVA_MODEL__TURN_STOP_STRATEGY_PARAMS='{}'
-
-# Note: For services with built-in turn detection (e.g., Deepgram Flux), set both to 'external':
-# EVA_MODEL__TURN_START_STRATEGY=external
-# EVA_MODEL__TURN_STOP_STRATEGY=external
-
-# VAD (Voice Activity Detection) analyzer: silero | none
-# EVA_MODEL__VAD=silero
-
-# VAD parameters (JSON)
-# - confidence: Minimum confidence threshold (0.0-1.0, default: 0.7)
-# - start_secs: Duration to wait before confirming voice start (default: 0.2)
-# - stop_secs: Duration to wait before confirming voice stop (default: 0.2)
-# - min_volume: Minimum audio volume threshold (0.0-1.0, default: 0.6)
-# EVA_MODEL__VAD_PARAMS='{"start_secs": 0.2, "stop_secs": 0.2, "min_volume": 0.6, "confidence": 0.7}'
-# ==============================================
-# Optional: Perturbations
-# ==============================================
-# Perturb the simulated user to stress-test the assistant under realistic
-# conditions. Four independent axes, all optional:
-#
-# 1. background_noise — ambient audio mixed into user speech
-# 2. accent — swaps the ElevenLabs user agent to an accented voice
-# 3. behavior — swaps the agent + modifies persona prompt
-# 4. connection_degradation — stacks VoIP artifacts (codec, packet loss, gain jitter)
-#
+# --- Default user simulator agents ---
+#i ElevenLabs agent ID for the default female-voice user persona.
+#d string
+EVA_DEFAULT_USER_F=your_elevenlabs_agent_id_for_default_user_f
+
+#i ElevenLabs agent ID for the default male-voice user persona.
+#d string
+EVA_DEFAULT_USER_M=your_elevenlabs_agent_id_for_default_user_m
+
+# --- Perturbations ---
# accent and behavior are MUTUALLY EXCLUSIVE (each claims the agent ID slot).
# background_noise and connection_degradation can stack with either.
+
# --- Background noise ---
-# Options: airport_gate | baby_crying | background_music | bad_connection_static |
-# coffee_shop | loud_construction | nyc_street | road_noise
-# Requires assets in assets/noise/. Download with:
-# python scripts/download_noise_assets.py
-# EVA_PERTURBATION__BACKGROUND_NOISE=coffee_shop
-#
-# Signal-to-noise ratio in dB (higher = cleaner user speech; default: 15)
-# EVA_PERTURBATION__SNR_DB=15
+# Requires assets in assets/noise/. Download with: python scripts/download_noise_assets.py
+#i Ambient noise to mix into user speech.
+#d enum
+#e airport_gate,baby_crying,background_music,bad_connection_static,coffee_shop,loud_construction,nyc_street,road_noise
+#v EVA_PERTURBATION__BACKGROUND_NOISE=coffee_shop
+
+#i Signal-to-noise ratio in dB. Higher = cleaner user speech.
+#d float
+#r 0,40,1
+#v EVA_PERTURBATION__SNR_DB=15
# --- Connection degradation ---
-# G.711 codec quantisation + gaussian static + 3% packet loss + random gain.
-# No assets required.
-# EVA_PERTURBATION__CONNECTION_DEGRADATION=false
+#i Apply G.711 codec + gaussian static + 3% packet loss + random gain.
+#d bool
+#v EVA_PERTURBATION__CONNECTION_DEGRADATION=false
+
+# --- Accent (mutually exclusive with Behavior) ---
+#i Accent to apply to the user simulator. Requires matching agent IDs below.
+#d enum
+#e french,indian,spanish,chinese
+#x perturbation_mode=Accent
+#v EVA_PERTURBATION__ACCENT=french
+
+# --- Accent agent IDs ---
+#i ElevenLabs agent ID — French accent, female voice.
+#d string
+#x perturbation_mode=Accent
+#x EVA_PERTURBATION__ACCENT=french
+#v EVA_FRENCH_ACCENT_USER_F=
+
+#i ElevenLabs agent ID — French accent, male voice.
+#d string
+#x perturbation_mode=Accent
+#x EVA_PERTURBATION__ACCENT=french
+#v EVA_FRENCH_ACCENT_USER_M=
+
+#i ElevenLabs agent ID — Indian accent, female voice.
+#d string
+#x perturbation_mode=Accent
+#x EVA_PERTURBATION__ACCENT=indian
+#v EVA_INDIAN_ACCENT_USER_F=
+
+#i ElevenLabs agent ID — Indian accent, male voice.
+#d string
+#x perturbation_mode=Accent
+#x EVA_PERTURBATION__ACCENT=indian
+#v EVA_INDIAN_ACCENT_USER_M=
+
+#i ElevenLabs agent ID — Spanish accent, female voice.
+#d string
+#x perturbation_mode=Accent
+#x EVA_PERTURBATION__ACCENT=spanish
+#v EVA_SPANISH_ACCENT_USER_F=
+
+#i ElevenLabs agent ID — Spanish accent, male voice.
+#d string
+#x perturbation_mode=Accent
+#x EVA_PERTURBATION__ACCENT=spanish
+#v EVA_SPANISH_ACCENT_USER_M=
+
+#i ElevenLabs agent ID — Chinese accent, female voice.
+#d string
+#x perturbation_mode=Accent
+#x EVA_PERTURBATION__ACCENT=chinese
+#v EVA_CHINESE_ACCENT_USER_F=
+
+#i ElevenLabs agent ID — Chinese accent, male voice.
+#d string
+#x perturbation_mode=Accent
+#x EVA_PERTURBATION__ACCENT=chinese
+#v EVA_CHINESE_ACCENT_USER_M=
+
+# --- Behavior (mutually exclusive with Accent) ---
+#i Behavior persona for the user simulator. Requires matching agent IDs below.
+#d enum
+#e aggressive_impatient,elderly_slow,forgetful_disorganized
+#x perturbation_mode=Behavior
+#v EVA_PERTURBATION__BEHAVIOR=forgetful_disorganized
+
+# --- Behavior agent IDs ---
+#i ElevenLabs agent ID — Aggressive/impatient persona, female voice.
+#d string
+#x perturbation_mode=Behavior
+#x EVA_PERTURBATION__BEHAVIOR=aggressive_impatient
+#v EVA_AGGRESSIVE_IMPATIENT_USER_F=
+
+#i ElevenLabs agent ID — Aggressive/impatient persona, male voice.
+#d string
+#x perturbation_mode=Behavior
+#x EVA_PERTURBATION__BEHAVIOR=aggressive_impatient
+#v EVA_AGGRESSIVE_IMPATIENT_USER_M=
+
+#i ElevenLabs agent ID — Elderly/slow persona, female voice.
+#d string
+#x perturbation_mode=Behavior
+#x EVA_PERTURBATION__BEHAVIOR=elderly_slow
+#v EVA_ELDERLY_SLOW_USER_F=
+
+#i ElevenLabs agent ID — Elderly/slow persona, male voice.
+#d string
+#x perturbation_mode=Behavior
+#x EVA_PERTURBATION__BEHAVIOR=elderly_slow
+#v EVA_ELDERLY_SLOW_USER_M=
+
+#i ElevenLabs agent ID — Forgetful/disorganized persona, female voice.
+#d string
+#x perturbation_mode=Behavior
+#x EVA_PERTURBATION__BEHAVIOR=forgetful_disorganized
+#v EVA_FORGETFUL_DISORGANIZED_USER_F=
+
+#i ElevenLabs agent ID — Forgetful/disorganized persona, male voice.
+#d string
+#x perturbation_mode=Behavior
+#x EVA_PERTURBATION__BEHAVIOR=forgetful_disorganized
+#v EVA_FORGETFUL_DISORGANIZED_USER_M=
-# --- Accent (mutually exclusive with behavior) ---
-# Options: french | indian | spanish | chinese
-# EVA_PERTURBATION__ACCENT=french
-#
-# Accent-specific ElevenLabs agent IDs. Pattern: EVA_{ACCENT}_ACCENT_USER_{F|M}
-# Only the pair matching EVA_PERTURBATION__ACCENT needs to be set.
-# EVA_FRENCH_ACCENT_USER_F=
-# EVA_FRENCH_ACCENT_USER_M=
-# EVA_INDIAN_ACCENT_USER_F=
-# EVA_INDIAN_ACCENT_USER_M=
-# EVA_SPANISH_ACCENT_USER_F=
-# EVA_SPANISH_ACCENT_USER_M=
-# EVA_CHINESE_ACCENT_USER_F=
-# EVA_CHINESE_ACCENT_USER_M=
-
-# --- Behavior (mutually exclusive with accent) ---
-# Options: aggressive_impatient | elderly_slow | forgetful_disorganized
-# EVA_PERTURBATION__BEHAVIOR=forgetful_disorganized
-#
-# Behavior-specific ElevenLabs agent IDs. Pattern: EVA_{BEHAVIOR}_USER_{F|M}
-# Only the pair matching EVA_PERTURBATION__BEHAVIOR needs to be set.
-# EVA_AGGRESSIVE_IMPATIENT_USER_F=
-# EVA_AGGRESSIVE_IMPATIENT_USER_M=
-# EVA_ELDERLY_SLOW_USER_F=
-# EVA_ELDERLY_SLOW_USER_M=
-# EVA_FORGETFUL_DISORGANIZED_USER_F=
-# EVA_FORGETFUL_DISORGANIZED_USER_M=
+# ==============================================
+# Debug & Logging
+# ==============================================
+
+#i Run only 1 record regardless of dataset size.
+#d bool
+#v EVA_DEBUG=false
+
+#i Comma-separated record IDs to run. Empty = run all.
+#d csv_list
+#v EVA_RECORD_IDS=
+
+#i Logging verbosity.
+#d enum
+#e DEBUG,INFO,WARNING,ERROR,CRITICAL
+#v EVA_LOG_LEVEL=INFO
diff --git a/README.md b/README.md
index 221486b3..a9d5edd1 100644
--- a/README.md
+++ b/README.md
@@ -153,6 +153,16 @@ eva \
--metrics task_completion,faithfulness,conciseness
```
+### Configuring EVA
+
+EVA includes a Streamlit config editor for building your `.env` file interactively:
+
+```bash
+streamlit run apps/config_editor.py
+```
+
+The editor covers all variables grouped by tab (API keys, voice pipeline, model deployments, runtime settings, perturbations, etc.), with proper widgets for each type. See [`apps/README.md`](apps/README.md) for details.
+
### Exploring Results
EVA includes a Streamlit analysis app for visualizing and comparing results:
diff --git a/apps/README.md b/apps/README.md
index 5ed18405..69a524c2 100644
--- a/apps/README.md
+++ b/apps/README.md
@@ -1,6 +1,20 @@
# EVA Apps
-Streamlit applications for exploring EVA results.
+Streamlit applications for exploring and configuring EVA.
+
+## Config Editor
+
+Interactive UI for building and editing `.env` configuration files without hand-editing JSON or looking up variable names.
+
+### Usage
+
+```bash
+streamlit run apps/config_editor.py
+```
+
+The app reads `.env.example` for the full variable set and loads existing values from `.env` if present. Each variable's widget type, enum options, ranges, and tooltips are declared directly in `.env.example` using annotation prefixes (`#i`, `#d`, `#e`, `#r`, `#x`, `#v`). Use the **Preview** button to inspect the generated file before saving, or **Download** to export it without writing to disk.
+
+---
## Analysis App
diff --git a/apps/analysis.py b/apps/analysis.py
index 121b8d11..101fdcdc 100644
--- a/apps/analysis.py
+++ b/apps/analysis.py
@@ -1585,7 +1585,7 @@ def _pivot_metric_label(m: str) -> str:
height=max(350, 80 + 40 * len(y_labels)),
margin={"l": 20, "r": 20, "t": 50, "b": 120},
)
- st.plotly_chart(heatmap_fig, use_container_width=True)
+ st.plotly_chart(heatmap_fig, width="stretch")
def render_run_overview(run_dir: Path):
diff --git a/apps/config_editor.py b/apps/config_editor.py
new file mode 100644
index 00000000..f59995b4
--- /dev/null
+++ b/apps/config_editor.py
@@ -0,0 +1,655 @@
+"""Streamlit app for editing EVA's .env file with a friendly UI.
+
+Run with:
+
+ streamlit run apps/config_editor.py
+
+The app reads .env.example to discover variables and their metadata
+(widget type, options, ranges, tooltips, conditions) from annotation
+prefixes (#i, #d, #e, #r, #g, #x, #v). .env is read on startup to
+prefill values and written on save.
+"""
+
+from __future__ import annotations
+
+import html as html_module
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+import streamlit as st
+import streamlit.components.v1 as st_components
+from config_io import (
+ AnnotatedVar,
+ ParsedEnvExample,
+ compute_disabled,
+ load_env,
+ parse_env_example,
+ serialize_env,
+)
+from config_schema import (
+ GROUP_API_CONFIGS,
+ GROUP_DEPLOYMENTS,
+ GROUP_MISC,
+ GROUP_PERTURBATIONS,
+ GROUP_RUNTIME,
+ GROUPS,
+ MUTEX_RADIOS,
+)
+
+REPO_ROOT = Path(__file__).resolve().parents[1]  # apps/config_editor.py -> repository root
+sys.path.insert(0, str(REPO_ROOT))  # NOTE(review): executed after the config_* imports above; confirm intent
+
+ENV_EXAMPLE_PATH = REPO_ROOT / ".env.example"  # annotated template: source of variables and widget metadata
+ENV_PATH = REPO_ROOT / ".env"  # user's file: read at startup to prefill values, written on save
+
+
+# ---------------------------------------------------------------------------
+# Initialisation
+# ---------------------------------------------------------------------------
+
+
+def _coerce(widget: str, raw: str) -> Any:
+    """Parse raw .env text into the Python value used by ``widget``; unusable input gives ``_empty_for(widget)``."""
+    text = (raw or "").strip()
+    if not text:
+        return _empty_for(widget)
+    try:
+        if widget == "bool":
+            return text.lower() in ("true", "1", "yes", "on")
+        if widget in ("int", "float"):
+            return int(text) if widget == "int" else float(text)
+        if widget == "csv_list":
+            return [x.strip() for x in text.split(",") if x.strip()]
+        if widget in ("json_object", "json_deployment_list"):
+            if len(text) >= 2 and text[0] == text[-1] == "'":
+                text = text[1:-1]  # drop the single quotes that wrap JSON values in the .env file
+            loaded = json.loads(text)
+            expected = dict if widget == "json_object" else list  # renderers assume this top-level type
+            return loaded if isinstance(loaded, expected) else _empty_for(widget)
+    except (ValueError, RecursionError):  # ValueError covers json.JSONDecodeError
+        return _empty_for(widget)
+    return text
+
+
+def _empty_for(widget: str) -> Any:
+    """Return the neutral "no value" placeholder for the given widget type."""
+    if widget in ("csv_list", "json_deployment_list"):
+        return []
+    if widget in ("int", "float"):
+        return None
+    if widget == "json_object":
+        return {}
+    # Remaining widgets: a bool is unchecked, every text-like widget is blank.
+    return False if widget == "bool" else ""
+
+
+def _detect_pipeline_mode(env: dict[str, str]) -> str:
+    """Infer the pipeline mode from loaded env values; S2S is checked first, LLM is the fallback."""
+    for mode, key in (("S2S", "EVA_MODEL__S2S"), ("AudioLLM", "EVA_MODEL__AUDIO_LLM")):
+        if env.get(key):
+            return mode
+    return "LLM"
+
+
+def _detect_perturbation_mode(env: dict[str, str]) -> str:
+    """Infer the perturbation mode from loaded env values; Accent is checked before Behavior."""
+    if env.get("EVA_PERTURBATION__ACCENT"):
+        return "Accent"
+    # Any falsy value (missing key or empty string) counts as "not configured".
+    return "Behavior" if env.get("EVA_PERTURBATION__BEHAVIOR") else "None"
+
+
+def _init_state() -> None:
+    """Populate ``st.session_state`` once per session from .env.example metadata and existing .env values."""
+    if "initialized" in st.session_state:
+        return
+    parsed = parse_env_example(ENV_EXAMPLE_PATH)
+    st.session_state.parsed = parsed
+    existing = load_env(ENV_PATH)
+    values: dict[str, Any] = {}
+    for var in parsed.vars:
+        raw = existing.get(var.name)
+        if raw is None and var.is_active:
+            raw = var.example_value.strip().strip("'\"")  # fall back to the example default
+        values[var.name] = _coerce(var.widget, raw or "")
+    # Keys present in .env but unknown to .env.example are carried over verbatim. At this point
+    # `values` holds exactly the annotated names, so it doubles as an O(1) membership index.
+    values.update({name: raw for name, raw in existing.items() if name not in values})
+    st.session_state.field_values = values
+    st.session_state.loaded_keys = set(existing)
+    st.session_state.pipeline_mode = _detect_pipeline_mode(existing)
+    st.session_state.perturbation_mode = _detect_perturbation_mode(existing)
+    for mx in MUTEX_RADIOS:  # seed each mutex radio, never overwriting a key that is already set
+        if mx.state_key not in st.session_state:
+            st.session_state[mx.state_key] = mx.default
+    st.session_state.initialized = True
+
+
+# ---------------------------------------------------------------------------
+# Visibility
+# ---------------------------------------------------------------------------
+
+
+def _is_visible_av(var: AnnotatedVar) -> bool:
+    """Return True when every #x condition declared on ``var`` is currently satisfied."""
+
+    def _lookup(key: str) -> Any:
+        # A value stored directly in session state takes precedence over the edited field values.
+        direct = st.session_state.get(key)
+        return st.session_state.get("field_values", {}).get(key) if direct is None else direct
+
+    return all(_lookup(key) == wanted for key, wanted in var.conditions)
+
+
+# ---------------------------------------------------------------------------
+# Widget renderers
+# ---------------------------------------------------------------------------
+
+
+def _render_annotated_var(var: AnnotatedVar) -> None:
+    """Render the widget for ``var`` and store the edited value in ``st.session_state.field_values``.
+
+    Hidden variables (unmet #x conditions) are skipped. Numeric values are clamped into the declared
+    #r range before rendering because ``st.number_input`` raises when ``value`` is out of bounds.
+    """
+    if not _is_visible_av(var):
+        return
+    values = st.session_state.field_values
+    current = values.get(var.name)
+    kind = var.widget
+    common = {"help": var.info or None, "key": f"w_{var.name}"}  # shared by every simple widget
+
+    if kind in ("string", "path"):
+        values[var.name] = st.text_input(var.name, value=current or "", **common)
+    elif kind == "secret":
+        values[var.name] = st.text_input(var.name, value=current or "", type="password", **common)
+    elif kind == "bool":
+        values[var.name] = st.checkbox(var.name, value=bool(current), **common)
+    elif kind in ("int", "float"):
+        cast = int if kind == "int" else float
+        rng = [cast(x) for x in (var.range or ())[:3]]  # [min, max, step]; any suffix may be absent
+        if isinstance(current, int if kind == "int" else (int, float)):
+            start = cast(current)
+        else:
+            start = rng[0] if rng else cast(0)  # no stored value: begin at the range minimum
+        # Clamp: a value loaded from .env may lie outside the range declared in .env.example.
+        if rng:
+            start = max(start, rng[0])
+        if len(rng) > 1:
+            start = min(start, rng[1])
+        values[var.name] = st.number_input(
+            var.name,
+            value=start,
+            min_value=rng[0] if rng else None,
+            max_value=rng[1] if len(rng) > 1 else None,
+            step=rng[2] if len(rng) > 2 else (1 if kind == "int" else 0.1),
+            **common,
+        )
+    elif kind == "enum":
+        options = _enum_options_for(var)
+        display = ["(unset)"] + options
+        idx = (options.index(current) + 1) if current in options else 0
+        choice = st.selectbox(var.name, display, index=idx, **common)
+        values[var.name] = "" if choice == "(unset)" else choice
+    elif kind == "multi_enum":
+        chosen = current if isinstance(current, list) else []
+        values[var.name] = st.multiselect(var.name, var.options, default=chosen, **common)
+    elif kind == "csv_list":
+        as_text = ",".join(current) if isinstance(current, list) else (current or "")
+        text = st.text_input(var.name, value=as_text, **common)
+        values[var.name] = [x.strip() for x in text.split(",") if x.strip()]
+    elif kind == "json_object":
+        _render_json_object(var.name, var.info, current or {})
+    elif kind == "json_deployment_list":
+        _render_deployment_list(var.name, var.info, current or [])
+
+
+def _enum_options_for(var: AnnotatedVar) -> list[str]:  # choices offered by an enum selectbox
+    if var.name != "EVA_MODEL__LLM":
+        return var.options
+    rows = st.session_state.field_values.get("EVA_MODEL_LIST") or []  # LLM aliases come from the deployments
+    return sorted({r.get("model_name", "") for r in rows if isinstance(r, dict)}.difference([""]))
+
+
+def _render_json_object(name: str, info: str, current: dict) -> None:
+    """Render a key/value table plus a raw-JSON text area for ``name`` and store the resulting dict."""
+    st.markdown(f"**{name}**" + (f" — {info}" if info else ""))
+    current = current if isinstance(current, dict) else {}  # tolerate a stale non-object value
+    raw_key = f"raw_{name}"
+    if raw_key not in st.session_state:
+        st.session_state[raw_key] = json.dumps(current, indent=2) if current else ""
+
+    rows = [{"key": k, "value": _scalar_to_str(v)} for k, v in current.items()] or [{"key": "", "value": ""}]
+    text_cols = {col: st.column_config.TextColumn(col, required=False) for col in ("key", "value")}
+    edited = st.data_editor(rows, num_rows="dynamic", width="stretch", column_config=text_cols, key=f"de_{name}")
+    parsed_kv: dict[str, Any] = {}
+    for row in edited:
+        k = (row.get("key") or "").strip()
+        if k:
+            parsed_kv[k] = _str_to_scalar(row.get("value"))
+
+    with st.expander("Raw JSON", expanded=False):
+        # The text area is driven by the session value seeded above, so no `value=` is passed.
+        # NOTE(review): non-empty raw JSON always wins over table edits — confirm that is intended.
+        text = st.text_area("Edit as JSON", key=raw_key, height=140)
+        if text.strip():
+            try:
+                loaded = json.loads(text)
+            except json.JSONDecodeError as e:
+                st.warning(f"Invalid JSON: {e}")
+            else:
+                # Only an object is a valid value here; a list or scalar has no key/value pairs.
+                if isinstance(loaded, dict):
+                    parsed_kv = loaded
+                else:
+                    st.warning('Raw JSON must be an object, e.g. {"key": "value"}.')
+
+    st.session_state.field_values[name] = parsed_kv
+
+
+def _scalar_to_str(v: Any) -> str:
+    """Format a JSON value for a text cell: containers as JSON, booleans lowercase, None as blank."""
+    if v is None:
+        return ""
+    if isinstance(v, bool):  # must precede str(): str(True) would give "True", not "true"
+        return "true" if v else "false"
+    # Nested containers are shown as inline JSON text; everything else uses its plain str() form.
+    return json.dumps(v) if isinstance(v, (dict, list)) else str(v)
+
+
+def _str_to_scalar(v: Any) -> Any:
+    """Parse table-cell text into a bool, number, dict or list; any other text is returned stripped.
+
+    Numbers must use JSON syntax, so ID-like text such as "007" or "1_000" stays a string (int() would mangle it).
+    """
+    if not isinstance(v, str):
+        return v
+    s = v.strip()
+    if s.lower() in ("true", "false"):
+        return s.lower() == "true"
+    try:
+        parsed = json.loads(s)
+    except ValueError:  # covers json.JSONDecodeError: not JSON at all, keep the text as typed
+        return s
+    if isinstance(parsed, (dict, list)) or (isinstance(parsed, (int, float)) and abs(parsed) < float("inf")):
+        return parsed
+    return s  # null, quoted strings and NaN/Infinity are kept verbatim as text
+
+
+def _render_deployment_list(name: str, info: str, current: list) -> None:
+    """Render an editor for a list of deployment dicts.
+
+    Shows a summary table (alias and ``litellm_params.model``) where rows can be
+    added or removed, then a select box to pick one deployment and edit its
+    ``litellm_params`` and any other top-level fields. The resulting list is
+    written to ``st.session_state.field_values[name]``.
+
+    Args:
+        name: Variable name; also used to build the widget keys.
+        info: Optional description appended to the heading.
+        current: Current list value; entries that are not dicts are dropped.
+    """
+    st.markdown(f"**{name}**" + (f" — {info}" if info else ""))
+    deployments: list[dict] = [d for d in current if isinstance(d, dict)]
+
+    st.caption("All deployments — add / remove rows here, then select one below to edit its params.")
+    # One placeholder row is shown when there are no deployments yet.
+    summary_rows = [
+        {"model_name": d.get("model_name", ""), "provider/model": (d.get("litellm_params") or {}).get("model", "")}
+        for d in deployments
+    ] or [{"model_name": "", "provider/model": ""}]
+
+    edited_summary = st.data_editor(
+        summary_rows,
+        num_rows="dynamic",
+        width="stretch",
+        column_config={
+            "model_name": st.column_config.TextColumn("model_name (alias)", required=False),
+            "provider/model": st.column_config.TextColumn("provider/model (litellm_params.model)", required=False),
+        },
+        key=f"de_summary_{name}",
+    )
+
+    # Merge the edited summary back onto the existing deployments so fields that are
+    # not shown in the table survive.
+    old_by_name = {d.get("model_name", ""): d for d in deployments}
+    merged: list[dict] = []
+    for idx, row in enumerate(edited_summary):
+        rname = (row.get("model_name") or "").strip()
+        if not rname:
+            # Rows without an alias are discarded.
+            continue
+        # Look up by alias first; otherwise fall back to the old deployment at the
+        # same position (covers a renamed row), else start from an empty dict.
+        base = dict(
+            old_by_name.get(rname) or old_by_name.get(list(old_by_name)[idx] if idx < len(old_by_name) else "") or {}
+        )
+        base["model_name"] = rname
+        lp = dict(base.get("litellm_params") or {})
+        pm = (row.get("provider/model") or "").strip()
+        if pm:
+            lp["model"] = pm
+        base["litellm_params"] = lp
+        merged.append(base)
+    deployments = merged
+
+    model_names = [d.get("model_name", "") for d in deployments if d.get("model_name")]
+    if not model_names:
+        # Nothing to select; store the (empty) list and stop.
+        st.session_state.field_values[name] = deployments
+        return
+
+    # Keep the previously selected alias when it still exists.
+    sel_key = f"_depl_sel_{name}"
+    prev = st.session_state.get(sel_key)
+    default_idx = model_names.index(prev) if prev in model_names else 0
+    selected = st.selectbox("Edit deployment", options=model_names, index=default_idx, key=sel_key)
+
+    # NOTE(review): `index` returns the first match, so duplicate aliases would always
+    # edit the first deployment with that name — confirm aliases are unique.
+    sel_idx = model_names.index(selected)
+    depl = deployments[sel_idx]
+
+    st.markdown("**litellm_params**")
+    lp = depl.get("litellm_params") or {}
+    lp_rows = [{"key": k, "value": _scalar_to_str(v)} for k, v in lp.items()] or [{"key": "", "value": ""}]
+    edited_lp = st.data_editor(
+        lp_rows,
+        num_rows="dynamic",
+        width="stretch",
+        column_config={
+            "key": st.column_config.TextColumn("key", required=False),
+            "value": st.column_config.TextColumn("value", required=False),
+        },
+        key=f"de_lp_{name}_{selected}",
+    )
+    # Rebuild litellm_params from the table; rows with an empty key are ignored.
+    new_lp: dict[str, Any] = {
+        (r.get("key") or "").strip(): _str_to_scalar(r.get("value")) for r in edited_lp if (r.get("key") or "").strip()
+    }
+    depl["litellm_params"] = new_lp
+
+    # Everything besides the alias and litellm_params is treated as an "extra" field.
+    extra_fields = {k: v for k, v in depl.items() if k not in ("model_name", "litellm_params")}
+    if extra_fields or st.checkbox("Add extra top-level fields", key=f"_extra_chk_{name}_{selected}"):
+        st.markdown("**Extra top-level fields** (e.g. `model_info`)")
+        extra_rows = [{"key": k, "value": _scalar_to_str(v)} for k, v in extra_fields.items()] or [
+            {"key": "", "value": ""}
+        ]
+        edited_extra = st.data_editor(
+            extra_rows,
+            num_rows="dynamic",
+            width="stretch",
+            column_config={
+                "key": st.column_config.TextColumn("key", required=False),
+                "value": st.column_config.TextColumn("value", required=False),
+            },
+            key=f"de_extra_{name}_{selected}",
+        )
+        edited_extra_keys = {(r.get("key") or "").strip() for r in edited_extra if (r.get("key") or "").strip()}
+        # Drop extras whose row was removed, then write back the remaining/new ones.
+        for k in list(extra_fields):
+            if k not in edited_extra_keys:
+                depl.pop(k, None)
+        for r in edited_extra:
+            k = (r.get("key") or "").strip()
+            if k:
+                depl[k] = _str_to_scalar(r.get("value"))
+
+    deployments[sel_idx] = depl
+    st.session_state.field_values[name] = deployments
+
+
+# ---------------------------------------------------------------------------
+# Group / tab rendering
+# ---------------------------------------------------------------------------
+
+
+def _auto_group_for(name: str) -> str | None:
+    """Pick a tab for a variable that the parsed template does not declare.
+
+    Rules are checked in order: names containing ``KEY`` or ``URL`` go to API
+    Configs; names containing ``_USER`` but not ``DEFAULT`` go to the
+    perturbations group; remaining ``EVA_``-prefixed names go to the runtime
+    group. Returns ``None`` when no rule matches.
+    """
+    looks_like_api = any(token in name for token in ("KEY", "URL"))
+    if looks_like_api:
+        return GROUP_API_CONFIGS
+    looks_like_persona = "_USER" in name and "DEFAULT" not in name
+    if looks_like_persona:
+        return GROUP_PERTURBATIONS
+    return GROUP_RUNTIME if name.startswith("EVA_") else None
+
+
+def _render_unmapped_var(name: str) -> None:
+    """Render a plain text input for a variable that has no template entry.
+
+    Non-string values are shown JSON-encoded (empty when falsy). The input is
+    masked when the name looks like a secret, using the same markers that
+    ``config_io._infer_widget`` uses for template variables. The entered text is
+    written back to ``st.session_state.field_values[name]``.
+    """
+    values = st.session_state.field_values
+    v = values.get(name, "")
+    if not isinstance(v, str):
+        v = json.dumps(v) if v else ""
+    upper_name = name.upper()
+    # Previously only "KEY" was masked, so e.g. *_TOKEN or *_PASSWORD showed in clear text.
+    is_sensitive = any(marker in upper_name for marker in ("KEY", "SECRET", "TOKEN", "PASSWORD"))
+    widget_type = "password" if is_sensitive else "default"
+    values[name] = st.text_input(name, value=v, key=f"w_{name}", type=widget_type)
+
+
+def _render_add_var_widget(context: str) -> None:
+    """Render the "Add a new variable" input and button.
+
+    The entered name is stripped and upper-cased. It is rejected when empty, when
+    it is not a valid env identifier, when it starts with ``EVA_``, or when it
+    already exists. Otherwise it is added to ``st.session_state.field_values``
+    with an empty value and the app reruns.
+
+    Args:
+        context: Suffix that keeps the widget keys unique per tab.
+    """
+    st.divider()
+    st.markdown("**Add a new variable**")
+    # The counter is part of the input key: bumping it after a successful add
+    # gives the text input a new key on the next run.
+    counter_key = f"_add_var_counter_{context}"
+    if counter_key not in st.session_state:
+        st.session_state[counter_key] = 0
+    input_key = f"_add_var_input_{context}_{st.session_state[counter_key]}"
+    col_input, col_btn = st.columns([4, 1])
+    with col_input:
+        new_name = st.text_input(
+            "Variable name",
+            key=input_key,
+            label_visibility="collapsed",
+            placeholder="e.g. MY_API_KEY",
+        )
+    with col_btn:
+        if st.button("Add", key=f"_add_var_btn_{context}", width="stretch"):
+            name = new_name.strip().upper()
+            if not name:
+                st.warning("Please enter a variable name.")
+            elif not (name.isascii() and name.isidentifier()):
+                # Lines with such names are skipped when the .env is loaded back,
+                # so the value would silently disappear.
+                st.error(
+                    "Variable names may only contain letters, digits and underscores, "
+                    "and must not start with a digit."
+                )
+            elif name.startswith("EVA_"):
+                st.error("`EVA_*` variables are declared in `.env.example`. Add it there to get a proper widget.")
+            elif name in st.session_state.field_values or name in {v.name for v in st.session_state.parsed.vars}:
+                st.warning(f"`{name}` already exists.")
+            else:
+                st.session_state.field_values[name] = ""
+                st.session_state[counter_key] += 1
+                st.rerun()
+    st.caption(
+        "Variables containing **KEY** or **URL** are placed under *API Configs*. "
+        "Everything else stays here. `EVA_*` variables cannot be added here — "
+        "add them to `.env.example` instead."
+    )
+
+
+def _render_group(group: str) -> None:
+    """Render one tab: its mode radios, its template variables and auto-routed extras.
+
+    Args:
+        group: Tab/group title whose widgets should be rendered.
+    """
+    parsed: ParsedEnvExample = st.session_state.parsed
+
+    # Render mutex radio buttons for this group
+    for mx in MUTEX_RADIOS:
+        if mx.group == group:
+            options = mx.options
+            current = st.session_state.get(mx.state_key, mx.default)
+            idx = options.index(current) if current in options else 0
+            # The chosen option is stored under the radio's own state key.
+            st.session_state[mx.state_key] = st.radio(
+                mx.label,
+                options=options,
+                index=idx,
+                horizontal=True,
+                help=mx.help,
+                key=f"radio_{mx.state_key}",
+            )
+            st.divider()
+
+    # Template vars for this group
+    group_vars = [v for v in parsed.vars if v.group == group]
+
+    # Auto-routed unmapped vars (from loaded .env, not in template)
+    all_known = set(parsed.by_name)
+    auto_names = [n for n in st.session_state.field_values if n not in all_known and _auto_group_for(n) == group]
+
+    if group == GROUP_API_CONFIGS:
+        # Sort alphabetically so KEYs and URLs cluster
+        schema_map = {v.name: v for v in group_vars}
+        for name in sorted(set(schema_map) | set(auto_names)):
+            if name in schema_map:
+                _render_annotated_var(schema_map[name])
+            else:
+                _render_unmapped_var(name)
+        _render_add_var_widget("api")
+    else:
+        # Other tabs keep template order, followed by the auto-routed extras.
+        for var in group_vars:
+            _render_annotated_var(var)
+        for name in auto_names:
+            _render_unmapped_var(name)
+
+    # Cross-field validation for deployments tab
+    if group == GROUP_DEPLOYMENTS:
+        deployments = st.session_state.field_values.get("EVA_MODEL_LIST") or []
+        chosen = st.session_state.field_values.get("EVA_MODEL__LLM")
+        names = {d.get("model_name") for d in deployments if isinstance(d, dict)}
+        if chosen and chosen not in names:
+            st.error(
+                f"EVA_MODEL__LLM = `{chosen}` does not match any deployment in EVA_MODEL_LIST. "
+                "Add it above or pick a different alias."
+            )
+
+
+def _render_misc_tab(parsed: ParsedEnvExample) -> None:
+    """Render the Misc tab.
+
+    Lists variables present in ``st.session_state.field_values`` that are neither
+    in the template nor auto-routed to a tab, plus template variables that have
+    no group. The "add variable" widget is rendered below the list.
+
+    Args:
+        parsed: Parsed template used to decide which names are known.
+    """
+    known = set(parsed.by_name)
+    truly_misc = [n for n in st.session_state.field_values if n not in known and _auto_group_for(n) is None]
+    # Also add template vars with no group assignment. A set tracks what is already
+    # listed so each membership test does not rescan (or copy) the list.
+    listed = set(truly_misc)
+    for var in parsed.vars:
+        if var.group is None and var.name not in listed:
+            truly_misc.append(var.name)
+            listed.add(var.name)
+
+    if not truly_misc:
+        st.info("No unmapped variables. 🎉")
+    else:
+        st.warning(
+            f"Found {len(truly_misc)} variable(s) not covered by the template. "
+            "Add them to `.env.example` for proper widgets."
+        )
+        for name in truly_misc:
+            _render_unmapped_var(name)
+    _render_add_var_widget("misc")
+
+
+# ---------------------------------------------------------------------------
+# Serialization
+# ---------------------------------------------------------------------------
+
+
+def _is_meaningful(name: str, value: Any) -> bool:
+    """Decide whether a field value should be passed on to the serializer.
+
+    Names listed in ``st.session_state["loaded_keys"]`` are always kept. Other
+    values are dropped when they are ``None``, ``False``, an empty
+    string/list/dict, or a number equal to zero.
+    """
+    if name in st.session_state.get("loaded_keys", set()):
+        return True
+    if value is None or value is False:
+        return False
+    if isinstance(value, (str, list, dict)):
+        return len(value) > 0
+    if isinstance(value, (int, float)):
+        return value != 0
+    return True
+
+
+def _build_serialized() -> str:
+    """Build the .env text for the current editor state.
+
+    Filters field values with ``_is_meaningful``, joins ``csv_list`` values into
+    comma-separated strings, derives the disabled set from the radio modes and
+    string values, and hands everything to ``serialize_env``.
+
+    Variables missing from the template are split by ``_auto_group_for``: those
+    routed to API Configs or Framework & Runtime are injected into that section;
+    all others are passed through as plain values so the serializer appends them
+    under Misc.
+    """
+    values = {k: v for k, v in st.session_state.field_values.items() if _is_meaningful(k, v)}
+    parsed: ParsedEnvExample = st.session_state.parsed
+    known = set(parsed.by_name)
+    # csv_list → comma-separated string for serializer
+    for var in parsed.vars:
+        if var.widget == "csv_list" and isinstance(values.get(var.name), list):
+            values[var.name] = ",".join(values[var.name])
+    # Collect current mode state for condition evaluation
+    mode_state: dict[str, str] = {mx.state_key: st.session_state.get(mx.state_key, mx.default) for mx in MUTEX_RADIOS}
+    mode_state.update({k: str(v) for k, v in values.items() if isinstance(v, str)})
+    disabled = compute_disabled(parsed, **mode_state)
+
+    # Only these auto-routed groups are written inline into their section. Anything
+    # else (including extras routed to the perturbations tab, which used to be
+    # dropped entirely) stays in the values handed to the serializer.
+    inline_groups = (GROUP_API_CONFIGS, GROUP_RUNTIME)
+    grouped: dict[str, dict] = {}
+    serializer_values: dict[str, Any] = {}
+    for k, v in values.items():
+        target = None if k in known else _auto_group_for(k)
+        if target in inline_groups:
+            grouped.setdefault(target, {})[k] = v
+        else:
+            serializer_values[k] = v
+    section_extras = {section: dict(sorted(extras.items())) for section, extras in grouped.items()}
+    return serialize_env(serializer_values, parsed, disabled=disabled, section_extras=section_extras or None)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    """Build the page: tabs with the editors on the left, preview/save actions on the right."""
+    st.set_page_config(page_title="EVA Config Editor", layout="wide", page_icon="⚙️")
+    _init_state()
+
+    st.markdown(
+        """
+
+        """,
+        unsafe_allow_html=True,
+    )
+
+    st.title("EVA Config Editor")
+    if not ENV_PATH.exists():
+        st.info(
+            f"No `.env` file found at `{ENV_PATH.relative_to(REPO_ROOT)}`. "
+            "Fill in your values below and click **Save to .env** to create it.",
+            icon="ℹ️",
+        )
+    st.caption(
+        f"Reading variable set from `{ENV_EXAMPLE_PATH.relative_to(REPO_ROOT)}`. "
+        + (
+            f"Loaded existing values from `{ENV_PATH.relative_to(REPO_ROOT)}`."
+            if ENV_PATH.exists()
+            else "Defaults seeded from `.env.example`."
+        )
+    )
+
+    left, right = st.columns([2, 1], gap="large")
+
+    with left:
+        # One tab per group, plus a trailing Misc tab for everything unmapped.
+        tabs = st.tabs(GROUPS + [GROUP_MISC])
+        for tab, group in zip(tabs[:-1], GROUPS):
+            with tab:
+                _render_group(group)
+        with tabs[-1]:
+            _render_misc_tab(st.session_state.parsed)
+
+    with right:
+        st.subheader("Preview & Save")
+        # Serialized after the tabs have rendered, so it reflects this run's widget values.
+        text = _build_serialized()
+        st.download_button(
+            "⬇️ Download .env",
+            data=text,
+            file_name=".env",
+            mime="text/plain",
+            width="stretch",
+        )
+        # JSON-encode, then HTML-escape, so the text can be embedded in an HTML attribute.
+        data_attr = html_module.escape(json.dumps(text), quote=True)
+        st_components.html(
+            f"""
+
+            """,
+            height=42,
+        )
+        if st.button("💾 Save to .env", width="stretch", type="primary"):
+            ENV_PATH.write_text(text)
+            st.success(f"Wrote {ENV_PATH}")
+        if st.button("👁️ View preview", width="stretch"):
+            _show_preview(text)
+
+
+@st.dialog("Preview .env", width="large")
+def _show_preview(text: str) -> None:
+    """Show the serialized .env text as an ini-highlighted code block in a dialog."""
+    st.code(text, language="ini")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/apps/config_io.py b/apps/config_io.py
new file mode 100644
index 00000000..dbf53f0b
--- /dev/null
+++ b/apps/config_io.py
@@ -0,0 +1,391 @@
+"""Parsing and serialization helpers for the EVA config editor.
+
+Annotation prefix scheme for .env.example:
+
+ # True comment — ignored by editor, preserved verbatim.
+ #i Info/tooltip text for the following variable.
+ #d Widget datatype: secret|bool|int|float|string|path|enum|
+ multi_enum|csv_list|json_object|json_deployment_list
+ #e Comma-separated enum options for enum/multi_enum.
+ #r Numeric range: min,max or min,max,step
+ #g Override tab/group assignment for this variable.
+ #x Visibility condition VAR=value (AND semantics; multiple lines ok).
+ #v Inactive variable definition (off by default, fully configurable).
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
+
+
+@dataclass
+class AnnotatedVar:
+    """One variable declaration from `.env.example` together with its annotations."""
+
+    name: str
+    is_active: bool  # False = declared with #v
+    example_value: str  # raw default from file
+    widget: str  # from #d or inferred
+    info: str  # from #i lines (joined)
+    options: list[str]  # from #e
+    range: tuple[float, ...] | None  # (min, max[, step]) from #r
+    group: str | None  # from #g or section header
+    conditions: list[tuple[str, str]]  # from #x lines (AND semantics)
+    line_start: int  # index into the template lines of the declaration's first line
+    line_end: int  # index of its last line (greater than line_start for multi-line quoted values)
+
+
+@dataclass
+class ParsedEnvExample:
+    """Parsed template: the raw lines, the variables found, and a name lookup."""
+
+    lines: list[str]
+    vars: list[AnnotatedVar]
+    by_name: dict[str, AnnotatedVar] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        # Build the lookup from `vars` unless the caller supplied a non-empty one.
+        if not self.by_name:
+            self.by_name = {v.name: v for v in self.vars}
+
+    # ── back-compat shim so old tests still compile ───────────────────────
+    @property
+    def specs(self) -> list[AnnotatedVar]:
+        """Alias for ``vars``."""
+        return self.vars
+
+
+def _is_section_rule(line: str) -> bool:
+    """Return True for a rule line: ``#``, optional whitespace, then three or more ``=``."""
+    return re.fullmatch(r"#\s*={3,}", line.strip()) is not None
+
+
+def _consume_quoted_continuation(lines: list[str], start_idx: int, value_head: str) -> int:
+    """Return the index of the line on which a value ends.
+
+    When ``value_head`` starts with a quote character that is not closed on the
+    same line, later lines are scanned for that quote character and the index of
+    the first line containing it is returned. With no such line the last index
+    of ``lines`` is returned. In every other case the answer is ``start_idx``.
+    """
+    head = value_head.strip()
+    if not head or head[0] not in "'\"":
+        return start_idx
+    opener = head[0]
+    if opener in head[1:]:
+        return start_idx
+    closing = next((j for j in range(start_idx + 1, len(lines)) if opener in lines[j]), None)
+    return len(lines) - 1 if closing is None else closing
+
+
+def _infer_widget(name: str, value: str) -> str:
+    """Guess a widget type from the variable name and its example value.
+
+    Name markers are checked first (secret, then path), then the value:
+    boolean literal, JSON list/object, int, float. Falls back to ``"string"``.
+    """
+    upper_name = name.upper()
+    lowered = value.strip().lower()
+    if any(marker in upper_name for marker in ("KEY", "SECRET", "TOKEN", "PASSWORD")):
+        return "secret"
+    if "CREDENTIALS" in upper_name or upper_name.endswith(("_PATH", "_DIR")):
+        return "path"
+    if lowered in ("true", "false"):
+        return "bool"
+    bare = lowered.strip("'\"")
+    if bare.startswith("["):
+        return "json_deployment_list" if "model_name" in bare else "json_object"
+    if bare.startswith("{"):
+        return "json_object"
+    # int is tried before float so "5" is not reported as a float.
+    for caster, widget in ((int, "int"), (float, "float")):
+        try:
+            caster(bare)
+        except ValueError:
+            continue
+        return widget
+    return "string"
+
+
+def parse_env_example(path: str | Path) -> ParsedEnvExample:
+    """Parse a .env.example file that uses the annotation prefix scheme.
+
+    Annotation lines (``#i``, ``#d``, ``#e``, ``#r``, ``#g``, ``#x``) accumulate
+    and are attached to the next variable declaration, either an active
+    ``NAME=value`` line or an inactive ``#v NAME=value`` line. Any other comment
+    or blank line clears the accumulated annotations. Only the first declaration
+    of each name is recorded.
+
+    Args:
+        path: Location of the template file.
+
+    Returns:
+        The raw lines together with the variables found, in file order.
+    """
+    text = Path(path).read_text()
+    raw_lines = text.splitlines(keepends=False)
+
+    vars_list: list[AnnotatedVar] = []
+    seen: set[str] = set()
+    current_section: str | None = None
+
+    # Pending annotations for the next variable declaration.
+    ann_info: list[str] = []
+    ann_widget: str | None = None
+    ann_options: list[str] = []
+    ann_range: tuple[float, ...] | None = None
+    ann_group: str | None = None
+    ann_conditions: list[tuple[str, str]] = []
+
+    def reset_ann() -> None:
+        """Discard all pending annotations."""
+        nonlocal ann_info, ann_widget, ann_options, ann_range, ann_group, ann_conditions
+        ann_info = []
+        ann_widget = None
+        ann_options = []
+        ann_range = None
+        ann_group = None
+        ann_conditions = []
+
+    def emit_var(name: str, is_active: bool, value_head: str, line_start: int) -> int:
+        """Record a variable and return the index of the last line it occupies."""
+        end_idx = _consume_quoted_continuation(raw_lines, line_start, value_head)
+        # Multi-line quoted values are re-joined with newlines.
+        raw_value = (
+            "\n".join([value_head, *raw_lines[line_start + 1 : end_idx + 1]]) if end_idx > line_start else value_head
+        )
+        widget = ann_widget or _infer_widget(name, raw_value)
+        vars_list.append(
+            AnnotatedVar(
+                name=name,
+                is_active=is_active,
+                example_value=raw_value,
+                widget=widget,
+                info=" ".join(ann_info),
+                options=list(ann_options),
+                range=ann_range,
+                group=ann_group or current_section,
+                conditions=list(ann_conditions),
+                line_start=line_start,
+                line_end=end_idx,
+            )
+        )
+        seen.add(name)
+        reset_ann()
+        return end_idx
+
+    i = 0
+    while i < len(raw_lines):
+        line = raw_lines[i]
+        stripped = line.strip()
+
+        # Section header block (# ===...=== / # Title / # ===...===)
+        if _is_section_rule(line):
+            if i + 1 < len(raw_lines):
+                inner = raw_lines[i + 1].lstrip("#").strip()
+                if inner and not _is_section_rule(raw_lines[i + 1]):
+                    current_section = inner
+            reset_ann()
+            # Skip ahead to just past the next rule line (the closing rule of the header).
+            j = i + 1
+            while j < len(raw_lines) and not _is_section_rule(raw_lines[j]):
+                j += 1
+            i = j + 1 if j < len(raw_lines) else j
+            continue
+
+        # Annotation lines — accumulate until next variable or reset
+        if stripped.startswith("#i "):
+            ann_info.append(stripped[3:].strip())
+            i += 1
+            continue
+        if stripped.startswith("#d "):
+            ann_widget = stripped[3:].strip()
+            i += 1
+            continue
+        if stripped.startswith("#e "):
+            ann_options = [o.strip() for o in stripped[3:].split(",") if o.strip()]
+            i += 1
+            continue
+        if stripped.startswith("#r "):
+            parts = [p.strip() for p in stripped[3:].split(",")]
+            try:
+                ann_range = tuple(float(p) for p in parts[:3])  # type: ignore[assignment]
+            except ValueError:
+                # A non-numeric range is ignored; any earlier range annotation is kept.
+                pass
+            i += 1
+            continue
+        if stripped.startswith("#g "):
+            ann_group = stripped[3:].strip()
+            i += 1
+            continue
+        if stripped.startswith("#x "):
+            cond = stripped[3:].strip()
+            if "=" in cond:
+                k, _, v = cond.partition("=")
+                ann_conditions.append((k.strip(), v.strip()))
+            i += 1
+            continue
+
+        # Inactive variable: #v NAME=value
+        if stripped.startswith("#v "):
+            rest = stripped[3:].strip()
+            if "=" in rest:
+                name, _, value_head = rest.partition("=")
+                name = name.strip()
+                if _NAME_RE.match(name) and name not in seen:
+                    end_idx = emit_var(name, False, value_head, i)
+                    i = end_idx + 1
+                    continue
+            # Malformed or repeated declaration: drop pending annotations and move on.
+            reset_ann()
+            i += 1
+            continue
+
+        # Active variable: NAME=value (no leading #)
+        if not stripped.startswith("#") and "=" in stripped:
+            name, _, value_head = stripped.partition("=")
+            name = name.strip()
+            if _NAME_RE.match(name) and name not in seen:
+                end_idx = emit_var(name, True, value_head, i)
+                i = end_idx + 1
+                continue
+
+        # True comment or blank — reset annotation accumulator
+        reset_ann()
+        i += 1
+
+    return ParsedEnvExample(lines=raw_lines, vars=vars_list)
+
+
+def load_env(path: str | Path) -> dict[str, str]:
+    """Load ``NAME=value`` pairs from an existing .env file.
+
+    A missing file gives an empty dict. Blank lines, comment lines (``#v``
+    declarations included) and lines whose name is not a valid identifier are
+    ignored. Quoted values may span several lines; one pair of surrounding
+    quotes is removed from each value.
+    """
+    env_path = Path(path)
+    if not env_path.exists():
+        return {}
+    lines = env_path.read_text().splitlines(keepends=False)
+    loaded: dict[str, str] = {}
+    cursor = 0
+    total = len(lines)
+    while cursor < total:
+        start = cursor
+        cursor += 1
+        entry = lines[start].strip()
+        if not entry or entry.startswith("#") or "=" not in entry:
+            continue
+        key, _, head = entry.partition("=")
+        key = key.strip()
+        if not _NAME_RE.match(key):
+            continue
+        last = _consume_quoted_continuation(lines, start, head)
+        raw = "\n".join([head, *lines[start + 1 : last + 1]]) if last > start else head
+        loaded[key] = _unquote(raw.strip())
+        # Resume after the last line consumed by this value.
+        cursor = last + 1
+    return loaded
+
+
+def _unquote(value: str) -> str:
+    """Strip one pair of matching single or double quotes around ``value``, if present."""
+    is_quoted = len(value) >= 2 and value[0] in "'\"" and value[-1] == value[0]
+    return value[1:-1] if is_quoted else value
+
+
+def _format_value(value: Any) -> str:
+    """Format a value as the right-hand side of a ``NAME=value`` line.
+
+    Booleans become ``true``/``false``, numbers use ``str``, and dicts/lists are
+    written as single-quoted JSON. Strings are written bare when they contain no
+    whitespace, ``#``, quote, ``$`` or newline. Otherwise they are single-quoted,
+    or JSON-encoded (double-quoted) when they contain a single quote themselves.
+    An empty string yields an empty result.
+    """
+    if isinstance(value, bool):
+        return "true" if value else "false"
+    if isinstance(value, (int, float)):
+        return str(value)
+    if isinstance(value, (dict, list)):
+        # An apostrophe inside the JSON would end the single-quoted wrapper early.
+        # JSON syntax itself never uses "'", so any occurrence is inside a string,
+        # where \u0027 is the equivalent escape and parses back to the same character.
+        encoded = json.dumps(value).replace("'", "\\u0027")
+        return f"'{encoded}'"
+    s = str(value)
+    if not s:
+        return ""
+    if any(c in s for c in (" ", "\t", "#", "'", '"', "$", "\n")):
+        if "'" not in s:
+            return f"'{s}'"
+        return json.dumps(s)
+    return s
+
+
+def _has_value(v: Any) -> bool:
+    """Return False for ``None`` and for an empty string, list or dict; True otherwise."""
+    if v is None:
+        return False
+    if isinstance(v, (str, list, dict)):
+        return len(v) > 0
+    return True
+
+
+def serialize_env(
+    values: dict[str, Any],
+    parsed: ParsedEnvExample,
+    disabled: set[str] | None = None,
+    section_extras: dict[str, dict[str, Any]] | None = None,
+) -> str:
+    """Produce a .env text using parsed as the structural template.
+
+    - Variables in values with a user-set entry → emitted as NAME=value (active).
+    - Variables in disabled → emitted as #v NAME=value (inactive); the user value is
+      kept when there is one, otherwise the template's example value is used.
+    - Everything else → original line(s) from the template verbatim.
+    - section_extras: {section_title: {name: value}} injected inline at the end of each
+      named section (just before the next section header starts). Extras whose
+      section title never occurs in the template are appended under Misc instead.
+    - Any values not in the template and not in section_extras are auto-appended as Misc.
+
+    Args:
+        values: User-set values keyed by variable name.
+        parsed: Parsed template providing the lines and variable positions.
+        disabled: Names to emit as inactive ``#v`` lines.
+        section_extras: Additional variables grouped by section title.
+
+    Returns:
+        The complete file text, ending with a newline.
+    """
+    disabled = disabled or set()
+    section_extras = section_extras or {}
+    lines = parsed.lines
+    out: list[str] = []
+    handled: set[str] = set()
+    flushed: set[str] = set()
+    var_by_start = {v.line_start: v for v in parsed.vars}
+    current_section: str | None = None
+
+    def _flush_extras(section: str | None) -> None:
+        """Emit the extras of ``section`` once and mark them handled."""
+        if not section or section in flushed or section not in section_extras:
+            return
+        flushed.add(section)
+        for name, val in section_extras[section].items():
+            if _has_value(val):
+                out.append(f"{name}={_format_value(val)}")
+                handled.add(name)
+
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+
+        # Header block (rule / title / rule). It is copied as a unit, mirroring how
+        # parse_env_example skips it, so the closing rule is never mistaken for the
+        # start of another section when a comment or variable follows it directly.
+        if _is_section_rule(line):
+            title_line = lines[i + 1] if i + 1 < len(lines) else ""
+            title = title_line.lstrip("#").strip()
+            if title and not _is_section_rule(title_line):
+                # Flush extras for the section we're leaving before writing the new header
+                _flush_extras(current_section)
+                current_section = title
+            closing = i + 1
+            while closing < len(lines) and not _is_section_rule(lines[closing]):
+                closing += 1
+            out.extend(lines[i : closing + 1])
+            i = closing + 1
+            continue
+
+        if i in var_by_start:
+            var = var_by_start[i]
+            user_value = values.get(var.name)
+            if var.name in disabled:
+                if _has_value(user_value):
+                    out.append(f"#v {var.name}={_format_value(user_value)}")
+                else:
+                    out.append(f"#v {var.name}={var.example_value.strip()}")
+            elif _has_value(user_value):
+                out.append(f"{var.name}={_format_value(user_value)}")
+            else:
+                out.extend(lines[var.line_start : var.line_end + 1])
+            handled.add(var.name)
+            i = var.line_end + 1
+            continue
+
+        out.append(line)
+        i += 1
+
+    # Flush extras for the final section
+    _flush_extras(current_section)
+
+    # Auto-collect any values not in the template into a Misc section, together with
+    # extras whose section was never found, so nothing is silently dropped.
+    misc: dict[str, Any] = {
+        name: val for name, val in values.items() if name not in handled and _has_value(val)
+    }
+    for section, extras in section_extras.items():
+        if section in flushed:
+            continue
+        for name, val in extras.items():
+            if name not in handled and name not in misc and _has_value(val):
+                misc[name] = val
+    if misc:
+        out.append("")
+        out.append("# ==============================================")
+        out.append("# Misc / Unmapped (added by config editor)")
+        out.append("# ==============================================")
+        out.extend(f"{name}={_format_value(val)}" for name, val in misc.items())
+
+    return "\n".join(out) + "\n"
+
+
+def compute_disabled(parsed: ParsedEnvExample, **state_values: str) -> set[str]:
+    """Collect the names of vars that have at least one unmet #x condition.
+
+    State is supplied as keyword arguments, e.g.
+    compute_disabled(parsed, pipeline_mode="LLM"). A key missing from the state
+    is compared as the empty string.
+    """
+    return {
+        var.name
+        for var in parsed.vars
+        if any(state_values.get(key, "") != expected for key, expected in var.conditions)
+    }
diff --git a/apps/config_schema.py b/apps/config_schema.py
new file mode 100644
index 00000000..1d0ce172
--- /dev/null
+++ b/apps/config_schema.py
@@ -0,0 +1,69 @@
+"""Schema constants for the EVA config editor.
+
+Variable metadata (widget types, options, ranges, tooltips, conditions) is
+now encoded directly in .env.example using annotation prefixes (#i, #d, #e,
+#r, #g, #x, #v). This module retains only things that are inherently
+editor-behaviour rather than file-structure:
+
+- Tab group name constants and ordering.
+- Mutex radio-button definitions (pipeline mode, perturbation mode).
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+# Tab/group titles used by the editor.
+GROUP_API_CONFIGS = "API Configs"
+GROUP_VOICE_PIPELINE = "Voice Pipeline"
+GROUP_DEPLOYMENTS = "LiteLLM Deployments"
+GROUP_RUNTIME = "Framework & Runtime"
+GROUP_TURN = "Turn Detection & VAD"
+GROUP_PERTURBATIONS = "User Config"  # note: the title differs from the constant's name
+GROUP_DEBUG = "Debug & Logging"
+GROUP_MISC = "Misc / Unmapped"
+
+# Ordered list of the regular groups; GROUP_MISC is deliberately not part of it.
+GROUPS: list[str] = [
+    GROUP_API_CONFIGS,
+    GROUP_VOICE_PIPELINE,
+    GROUP_DEPLOYMENTS,
+    GROUP_RUNTIME,
+    GROUP_TURN,
+    GROUP_PERTURBATIONS,
+    GROUP_DEBUG,
+]
+
+
+@dataclass
+class MutexRadio:
+    """Definition of a radio button that selects one mode out of several.
+
+    When no ``default`` is given, the first entry of ``options`` is used.
+    """
+
+    state_key: str  # st.session_state key managed by this radio
+    group: str  # which tab renders this radio
+    label: str
+    options: list[str]
+    help: str = ""
+    default: str = ""
+
+    def __post_init__(self) -> None:
+        if self.options and not self.default:
+            self.default = self.options[0]
+
+
+# Radio definitions. Each `state_key` names the selected mode; the keys here
+# ("pipeline_mode", "perturbation_mode") are the ones the unit tests expect in
+# `#x` conditions and pass to `compute_disabled`.
+MUTEX_RADIOS: list[MutexRadio] = [
+    MutexRadio(
+        state_key="pipeline_mode",
+        group=GROUP_VOICE_PIPELINE,
+        label="Pipeline mode",
+        options=["LLM", "S2S", "AudioLLM"],
+        help="LLM = STT+LLM+TTS. S2S = speech-to-speech model. AudioLLM = audio-input LLM + TTS.",
+        default="LLM",
+    ),
+    MutexRadio(
+        state_key="perturbation_mode",
+        group=GROUP_PERTURBATIONS,
+        label="Perturbation persona",
+        options=["None", "Accent", "Behavior"],
+        help="Accent and Behavior are mutually exclusive (each claims the agent ID slot).",
+        default="None",
+    ),
+]
diff --git a/tests/unit/test_config_editor_integration.py b/tests/unit/test_config_editor_integration.py
new file mode 100644
index 00000000..08968efe
--- /dev/null
+++ b/tests/unit/test_config_editor_integration.py
@@ -0,0 +1,107 @@
+"""End-to-end check.
+
+A config built via the editor's serializer must construct a valid RunConfig
+for each pipeline mode.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from unittest.mock import patch
+
+from apps.config_io import compute_disabled, parse_env_example, serialize_env
+from eva.models.config import RunConfig
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+
+# Deployment list shared by the tests below (used as the EVA_MODEL_LIST value):
+# one OpenAI, one Vertex AI and one Bedrock entry, all with placeholder credentials.
+_MODEL_LIST = [
+    {
+        "model_name": "gpt-5.2",
+        "litellm_params": {"model": "openai/gpt-5.2", "api_key": "sk-test", "max_parallel_requests": 5},
+        "model_info": {"base_model": "gpt-5.2"},
+    },
+    {
+        "model_name": "gemini-3.1-pro-preview",
+        "litellm_params": {
+            "model": "vertex_ai/gemini-3.1-pro-preview",
+            "vertex_project": "p",
+            "vertex_location": "global",
+            "vertex_credentials": "/tmp/x.json",
+            "max_parallel_requests": 5,
+        },
+    },
+    {
+        "model_name": "us.anthropic.claude-opus-4-6",
+        "litellm_params": {
+            "model": "bedrock/us.anthropic.claude-opus-4-6-v1",
+            "aws_access_key_id": "k",
+            "aws_secret_access_key": "s",
+            "max_parallel_requests": 5,
+        },
+    },
+]
+
+
+def _serialize(values: dict, parsed, pipeline_mode: str = "LLM", perturbation_mode: str = "None") -> str:
+    """Serialize ``values`` with the disabled set derived from the two mode settings."""
+    modes = {"pipeline_mode": pipeline_mode, "perturbation_mode": perturbation_mode}
+    return serialize_env(values, parsed, disabled=compute_disabled(parsed, **modes))
+
+
+def _load_isolated(env_file: Path) -> RunConfig:
+    """Build a RunConfig from ``env_file`` with the process environment reduced to PATH."""
+    minimal_env = {"PATH": os.environ["PATH"]}
+    with patch.dict(os.environ, minimal_env, clear=True):
+        return RunConfig(_env_file=env_file, _cli_parse_args=False)
+
+
+def test_llm_pipeline_serialization_constructs_runconfig(tmp_path: Path) -> None:
+    """A serialized LLM-mode config loads into RunConfig with the chosen models and domain."""
+    template = parse_env_example(REPO_ROOT / ".env.example")
+    settings = {
+        "EVA_MODEL_LIST": _MODEL_LIST,
+        "EVA_MODEL__LLM": "gpt-5.2",
+        "EVA_MODEL__STT": "deepgram",
+        "EVA_MODEL__TTS": "cartesia",
+        "EVA_MODEL__STT_PARAMS": {"api_key": "k", "model": "nova-2"},
+        "EVA_MODEL__TTS_PARAMS": {"api_key": "k", "model": "sonic"},
+        "EVA_DOMAIN": "airline",
+    }
+    dotenv_path = tmp_path / ".env"
+    dotenv_path.write_text(_serialize(settings, template))
+    cfg = _load_isolated(dotenv_path)
+    assert cfg.model.llm == "gpt-5.2"
+    assert cfg.model.stt == "deepgram"
+    assert cfg.model.tts == "cartesia"
+    assert cfg.domain == "airline"
+
+
+def test_s2s_pipeline_serialization_constructs_runconfig(tmp_path: Path) -> None:
+    """A config serialized in S2S mode loads into RunConfig with the S2S model set."""
+    template = parse_env_example(REPO_ROOT / ".env.example")
+    settings = {
+        "EVA_MODEL_LIST": _MODEL_LIST,
+        "EVA_MODEL__S2S": "gpt-realtime-mini",
+        "EVA_MODEL__S2S_PARAMS": {"api_key": "k", "model": "gpt-realtime-mini"},
+        "EVA_DOMAIN": "airline",
+    }
+    dotenv_path = tmp_path / ".env"
+    dotenv_path.write_text(_serialize(settings, template, pipeline_mode="S2S"))
+    cfg = _load_isolated(dotenv_path)
+    assert cfg.model.s2s == "gpt-realtime-mini"
+
+
+def test_perturbation_accent_serialization_constructs_runconfig(tmp_path: Path) -> None:
+    """With Accent mode selected, the loaded config carries the accent and no behavior."""
+    template = parse_env_example(REPO_ROOT / ".env.example")
+    settings = {
+        "EVA_MODEL_LIST": _MODEL_LIST,
+        "EVA_MODEL__LLM": "gpt-5.2",
+        "EVA_MODEL__STT": "deepgram",
+        "EVA_MODEL__TTS": "cartesia",
+        "EVA_MODEL__STT_PARAMS": {"api_key": "k", "model": "nova-2"},
+        "EVA_MODEL__TTS_PARAMS": {"api_key": "k", "model": "sonic"},
+        "EVA_DOMAIN": "airline",
+        "EVA_PERTURBATION__ACCENT": "french",
+    }
+    dotenv_path = tmp_path / ".env"
+    dotenv_path.write_text(_serialize(settings, template, perturbation_mode="Accent"))
+    cfg = _load_isolated(dotenv_path)
+    assert cfg.perturbation is not None
+    assert cfg.perturbation.accent == "french"
+    assert cfg.perturbation.behavior is None
diff --git a/tests/unit/test_config_io.py b/tests/unit/test_config_io.py
new file mode 100644
index 00000000..77d0baed
--- /dev/null
+++ b/tests/unit/test_config_io.py
@@ -0,0 +1,156 @@
+"""Unit tests for apps/config_io.py (annotation-aware env parser/serializer)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from apps.config_io import load_env, parse_env_example, serialize_env
+from apps.config_schema import GROUP_MISC
+
+ENV_EXAMPLE = Path(__file__).resolve().parents[2].joinpath(".env.example")  # repo root
+
+
+def test_parses_active_var() -> None:
+    """An uncommented assignment parses as active and keeps its `#d` widget and `#i` info text."""
+    key_spec = parse_env_example(ENV_EXAMPLE).by_name["ELEVENLABS_API_KEY"]
+    assert key_spec.is_active is True
+    assert key_spec.widget == "secret"
+    assert "ElevenLabs" in key_spec.info
+
+
+def test_parses_inactive_var() -> None:
+    """`EVA_DOMAIN` parses as inactive, with an enum widget that offers `airline`."""
+    domain_spec = parse_env_example(ENV_EXAMPLE).by_name["EVA_DOMAIN"]
+    assert domain_spec.is_active is False
+    assert domain_spec.widget == "enum"
+    assert "airline" in domain_spec.options
+
+
+def test_parses_enum_options() -> None:
+    """The option list parsed for `EVA_MODEL__STT` includes both `deepgram` and `cartesia`."""
+    stt_options = parse_env_example(ENV_EXAMPLE).by_name["EVA_MODEL__STT"].options
+    assert "deepgram" in stt_options
+    assert "cartesia" in stt_options
+
+
+def test_parses_range() -> None:
+    """`EVA_MAX_CONCURRENT_CONVERSATIONS` carries a parsed numeric range of 1 to 100."""
+    bounds = parse_env_example(ENV_EXAMPLE).by_name["EVA_MAX_CONCURRENT_CONVERSATIONS"].range
+    assert bounds is not None
+    assert bounds[0] == 1.0
+    assert bounds[1] == 100.0
+
+
+def test_parses_condition() -> None:
+    """`EVA_MODEL__STT` carries the condition pair `("pipeline_mode", "LLM")`."""
+    stt_spec = parse_env_example(ENV_EXAMPLE).by_name["EVA_MODEL__STT"]
+    assert ("pipeline_mode", "LLM") in stt_spec.conditions
+
+
+def test_parses_multi_condition() -> None:
+    """A variable gated on two conditions exposes both `(key, value)` pairs."""
+    accent_spec = parse_env_example(ENV_EXAMPLE).by_name["EVA_FRENCH_ACCENT_USER_F"]
+    assert ("perturbation_mode", "Accent") in accent_spec.conditions
+    assert ("EVA_PERTURBATION__ACCENT", "french") in accent_spec.conditions
+
+
+def test_group_from_section_header() -> None:
+    specs = parse_env_example(ENV_EXAMPLE).by_name  # one parse serves both group lookups
+    assert specs["ELEVENLABS_API_KEY"].group == "API Configs"
+    assert specs["EVA_MODEL__LLM"].group == "LiteLLM Deployments"
+
+
+def test_dedupes_repeated_names() -> None:
+    """`EVA_METRICS` must occupy exactly one entry in the parsed variable list."""
+    parsed_names = [var.name for var in parse_env_example(ENV_EXAMPLE).vars]
+    assert parsed_names.count("EVA_METRICS") == 1
+
+
+def test_multiline_deployment_list() -> None:
+    """`EVA_MODEL_LIST` uses the deployment-list widget and its `line_end` exceeds `line_start`."""
+    model_list_spec = parse_env_example(ENV_EXAMPLE).by_name["EVA_MODEL_LIST"]
+    assert model_list_spec.widget == "json_deployment_list"
+    assert model_list_spec.line_end > model_list_spec.line_start
+
+
+def test_serialize_with_no_values_is_byte_identical_to_example() -> None:
+    """Serializing with no overrides reproduces `.env.example`, ending in a newline."""
+    rendered = serialize_env({}, parse_env_example(ENV_EXAMPLE))
+    expected = ENV_EXAMPLE.read_text()
+    # The comparison tolerates an example file that lacks a trailing newline.
+    expected = expected if expected.endswith("\n") else expected + "\n"
+    assert rendered == expected
+
+
+def test_serialize_overrides_active_var() -> None:
+    """A supplied value replaces the placeholder of an already-active variable."""
+    rendered = serialize_env({"OPENAI_API_KEY": "sk-test-123"}, parse_env_example(ENV_EXAMPLE))
+    assert "OPENAI_API_KEY=sk-test-123" in rendered
+    assert "your_openai_api_key_here" not in rendered
+
+
+def test_serialize_activates_inactive_var() -> None:
+    """Supplying `EVA_DOMAIN` emits it as a plain assignment line, not behind a `#v` prefix."""
+    rendered = serialize_env({"EVA_DOMAIN": "airline"}, parse_env_example(ENV_EXAMPLE))
+    output_lines = rendered.splitlines()
+    assert "EVA_DOMAIN=airline" in output_lines
+    assert "#v EVA_DOMAIN=airline" not in output_lines
+
+
+def test_serialize_json_blob_single_quoted() -> None:
+    """A list value for `EVA_MODEL_LIST` is written with an opening single quote."""
+    model_list = [{"model_name": "x", "litellm_params": {"model": "openai/x"}}]
+    rendered = serialize_env({"EVA_MODEL_LIST": model_list}, parse_env_example(ENV_EXAMPLE))
+    assert "EVA_MODEL_LIST='" in rendered
+
+
+def test_serialize_bool_lowercases() -> None:
+    """A Python `True` is rendered as the lowercase literal `true`."""
+    env_text = serialize_env({"EVA_DEBUG": True}, parse_env_example(ENV_EXAMPLE))
+    assert "EVA_DEBUG=true" in env_text
+
+
+def test_serialize_appends_misc_section_for_unknown_vars() -> None:
+    """A name the example presumably lacks is still written, and `GROUP_MISC` appears."""
+    env_text = serialize_env({"EVA_TOTALLY_NEW_VAR": "hello"}, parse_env_example(ENV_EXAMPLE))
+    assert GROUP_MISC in env_text
+    assert "EVA_TOTALLY_NEW_VAR=hello" in env_text
+
+
+def test_serialize_disabled_var_uses_current_value() -> None:
+    """A variable listed in `disabled` is written as a `#v` line carrying the supplied value."""
+    env_text = serialize_env(
+        {"EVA_MODEL__STT": "deepgram"},
+        parse_env_example(ENV_EXAMPLE),
+        disabled={"EVA_MODEL__STT"},
+    )
+    assert "#v EVA_MODEL__STT=deepgram" in env_text
+
+
+def test_load_env_reads_existing_file(tmp_path: Path) -> None:
+    """`load_env` returns active assignments only, skipping `#v` lines and stripping quotes."""
+    dotenv = tmp_path / ".env"
+    dotenv.write_text("FOO=bar\n#v COMMENTED=skipme\nQUOTED='hello world'\nJSON='[{\"a\": 1}]'\n")
+    assert load_env(dotenv) == {"FOO": "bar", "QUOTED": "hello world", "JSON": '[{"a": 1}]'}
+
+
+def test_load_env_missing_file_returns_empty(tmp_path: Path) -> None:
+    assert load_env(tmp_path.joinpath("does-not-exist")) == {}  # this path is never created
+
+
+def test_round_trip_through_load_env(tmp_path: Path) -> None:
+    """Values written by `serialize_env` come back from `load_env` as their string forms."""
+    parsed = parse_env_example(ENV_EXAMPLE)
+    overrides = {
+        "OPENAI_API_KEY": "sk-abc",
+        "EVA_DEBUG": True,
+        "EVA_MAX_CONCURRENT_CONVERSATIONS": 8,
+    }
+    written = serialize_env(overrides, parsed)
+    dotenv = tmp_path / ".env"
+    dotenv.write_text(written)
+    loaded = load_env(dotenv)
+    # The bool and the int are expected back as text, not as their original Python types.
+    assert loaded["OPENAI_API_KEY"] == "sk-abc"
+    assert loaded["EVA_DEBUG"] == "true"
+    assert loaded["EVA_MAX_CONCURRENT_CONVERSATIONS"] == "8"