Merged
18 changes: 11 additions & 7 deletions README.md
@@ -69,8 +69,8 @@ puma rm inftyai/tiny-random-gpt2
 ### API Server
 
 ```bash
-# Start the inference server
-puma serve
+# Start the inference server with a model
+puma serve inftyai/tiny-random-gpt2
 
 # Server will start on http://0.0.0.0:8000
 # API endpoints:
@@ -109,7 +109,7 @@ curl http://localhost:8000/v1/chat/completions \
 | `rm <model>` | ✅ | Remove model and cache |
 | `info` | ✅ | Display system information |
 | `version` | ✅ | Show PUMA version |
-| `serve` | ✅ | Start OpenAI-compatible API server |
+| `serve <model>` | ✅ | Start OpenAI-compatible API server with a model |
 | `ps` | 🚧 | List running models |
 | `run` | 🚧 | Start model inference |
 | `stop` | 🚧 | Stop running model |
@@ -151,11 +151,14 @@ PUMA provides an OpenAI-compatible API server for model inference.
 ### Starting the Server
 
 ```bash
-# Default: 0.0.0.0:8000
-puma serve
+# Start server with a model (default: 0.0.0.0:8000)
+puma serve inftyai/tiny-random-gpt2
 
 # Custom host and port
-puma serve --host 127.0.0.1 --port 3000
+puma serve inftyai/tiny-random-gpt2 --host 127.0.0.1 --port 3000
+
+# Model must be pulled first
+puma pull inftyai/tiny-random-gpt2
 ```
 
 ### API Endpoints
@@ -188,13 +191,14 @@ curl http://localhost:8000/v1/chat/completions \
 
 #### List Models
 ```bash
+# Returns the currently loaded model
 curl http://localhost:8000/v1/models
 ```
 
 #### Health Check
 ```bash
 curl http://localhost:8000/health
-# Returns: {"status":"ok","version":"0.0.2"}
+# Returns: {"status":"ok"}
 ```
 
 ### OpenAI Python Client
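Taken together, the README changes describe a two-step flow: pull a model, then serve it. A minimal Rust client sketch exercising the two GET endpoints documented above, assuming the server was started with `puma serve inftyai/tiny-random-gpt2` on the default `0.0.0.0:8000`; the `reqwest`, `tokio`, and `serde_json` dependencies and the exact `/v1/models` response shape are assumptions, not part of this PR:

```rust
use serde_json::Value;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // Health check: after this PR the body is {"status":"ok"}, with no version field.
    let health: Value = reqwest::get("http://localhost:8000/health")
        .await?
        .json()
        .await?;
    assert_eq!(health["status"], "ok");

    // /v1/models now reports the single model passed to `puma serve`.
    let models: Value = reqwest::get("http://localhost:8000/v1/models")
        .await?
        .json()
        .await?;
    println!("{models}");
    Ok(())
}
```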
2 changes: 0 additions & 2 deletions src/api/routes.rs
@@ -60,13 +60,11 @@ pub fn create_router<E: InferenceEngine + Clone + 'static>(
 #[derive(Serialize)]
 struct HealthResponse {
     status: String,
-    version: String,
 }
 
 /// Health check endpoint
 async fn health_check() -> Json<HealthResponse> {
     Json(HealthResponse {
         status: "ok".to_string(),
-        version: env!("CARGO_PKG_VERSION").to_string(),
     })
 }
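The diff shows only the handler, so here is a self-contained sketch of how `health_check` mounts, assuming `create_router` uses axum (the `Json` return type and the `Router`-building signature in the hunk header suggest it). The standalone `router()` is illustrative, not the PR's actual `create_router`:

```rust
use axum::{routing::get, Json, Router};
use serde::Serialize;

#[derive(Serialize)]
struct HealthResponse {
    status: String,
}

// Same shape as the PR: a static "ok", no version field.
async fn health_check() -> Json<HealthResponse> {
    Json(HealthResponse {
        status: "ok".to_string(),
    })
}

// Illustrative stand-in for the PR's create_router, which also wires
// the /v1/* endpoints over an inference engine.
fn router() -> Router {
    Router::new().route("/health", get(health_check))
}
```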
1 change: 0 additions & 1 deletion src/api/tests.rs
@@ -100,7 +100,6 @@ async fn test_health_check() {
 
     assert_eq!(status, StatusCode::OK);
     assert_eq!(json["status"], "ok");
-    assert!(json["version"].is_string());
 }
 
 #[tokio::test]
76 changes: 75 additions & 1 deletion src/cli/commands.rs
@@ -43,6 +43,9 @@ enum Commands {
 
 #[derive(Parser)]
 struct ServeArgs {
+    /// Model name to serve (e.g., inftyai/tiny-random-gpt2)
+    model: String,
+
     /// Host address to bind to
     #[arg(long, default_value = "0.0.0.0")]
     host: String,
@@ -221,7 +224,24 @@ pub async fn run(cli: Cli) {
         }
 
         Commands::Serve(args) => {
-            if let Err(e) = crate::cli::serve::execute(&args.host, args.port).await {
+            // Verify model exists
+            let registry = ModelRegistry::new(None);
+            match registry.get_model(&args.model) {
+                Ok(Some(_)) => {
+                    // Model exists, proceed
+                }
+                Ok(None) => {
+                    eprintln!("❌ Error: Model '{}' not found in registry", args.model);
+                    eprintln!("Run 'puma pull {}' to download it first", args.model);
+                    std::process::exit(1);
+                }
+                Err(e) => {
+                    eprintln!("❌ Error checking model: {}", e);
+                    std::process::exit(1);
+                }
+            }
+
+            if let Err(e) = crate::cli::serve::execute(&args.host, args.port, &args.model).await {
                 eprintln!("Error starting server: {}", e);
                 std::process::exit(1);
             }
@@ -392,4 +412,58 @@ mod tests {
         assert_eq!(result.metadata.cache.revision, "v2");
         assert_eq!(result.metadata.cache.size, 2000);
     }
+
+    #[test]
+    fn test_serve_with_existing_model() {
+        let temp_dir = TempDir::new().unwrap();
+        let registry = ModelRegistry::new(Some(temp_dir.path().to_path_buf()));
+
+        let model = create_test_model("test/serve-model", "abc123");
+        registry.register_model(model).unwrap();
+
+        // Verify model exists (this is what serve command checks)
+        let result = registry.get_model("test/serve-model");
+        assert!(result.is_ok());
+        assert!(result.unwrap().is_some());
+    }
+
+    #[test]
+    fn test_serve_with_nonexistent_model() {
+        let temp_dir = TempDir::new().unwrap();
+        let registry = ModelRegistry::new(Some(temp_dir.path().to_path_buf()));
+
+        // Verify model doesn't exist
+        let result = registry.get_model("nonexistent/model");
+        assert!(result.is_ok());
+        assert!(result.unwrap().is_none());
+    }
+
+    #[test]
+    fn test_serve_args_parsing() {
+        // Test that ServeArgs requires model argument
+        use clap::CommandFactory;
+        let app = Cli::command();
+
+        // This should fail without model argument
+        let result = app.clone().try_get_matches_from(vec!["puma", "serve"]);
+        assert!(result.is_err());
+
+        // This should succeed with model argument
+        let result = app
+            .clone()
+            .try_get_matches_from(vec!["puma", "serve", "test/model"]);
+        assert!(result.is_ok());
+
+        // This should succeed with model and optional args
+        let result = app.try_get_matches_from(vec![
+            "puma",
+            "serve",
+            "test/model",
+            "--host",
+            "127.0.0.1",
+            "--port",
+            "9000",
+        ]);
+        assert!(result.is_ok());
+    }
 }
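The three-way `match` in the `Serve` branch above leans on a specific registry contract: `Ok(Some)` vs `Ok(None)` vs `Err` distinguishes "model never pulled" from "registry itself unreadable". A toy, compilable model of that contract; every type here is an illustrative stand-in for the real `ModelRegistry` in src/registry/model_registry.rs:

```rust
use std::collections::HashMap;

// Illustrative stand-ins; the real types live in src/registry/model_registry.rs.
struct Model {
    name: String,
}

struct ModelRegistry {
    models: HashMap<String, Model>,
}

impl ModelRegistry {
    /// Ok(Some(m)): model is registered and can be served.
    /// Ok(None): registry was read fine, but the model was never pulled.
    /// Err(e): the registry itself could not be read (I/O error, corruption, ...).
    fn get_model(&self, name: &str) -> Result<Option<&Model>, std::io::Error> {
        Ok(self.models.get(name))
    }
}

fn main() {
    let registry = ModelRegistry { models: HashMap::new() };
    match registry.get_model("inftyai/tiny-random-gpt2") {
        Ok(Some(m)) => println!("serving {}", m.name),
        Ok(None) => eprintln!("not pulled yet; run `puma pull` first"),
        Err(e) => eprintln!("registry error: {e}"),
    }
}
```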
8 changes: 6 additions & 2 deletions src/cli/serve.rs
@@ -7,7 +7,11 @@ use crate::backend::mock::MockEngine;
 use crate::registry::model_registry::ModelRegistry;
 
 /// Execute the serve command
-pub async fn execute(host: &str, port: u16) -> Result<(), Box<dyn std::error::Error>> {
+pub async fn execute(
+    host: &str,
+    port: u16,
+    model_name: &str,
+) -> Result<(), Box<dyn std::error::Error>> {
     println!(
         "{}",
         "
@@ -23,7 +27,7 @@ pub async fn execute(host: &str, port: u16) -> Result<(), Box<dyn std::error::Error>> {
         .bright_blue()
         .bold()
     );
-    info!("Starting PUMA inference server");
+    info!("Starting PUMA to serve model: {}", model_name);
 
     // Initialize backend (MockEngine for now, replace with MLX later)
     let engine = Arc::new(MockEngine::new());
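Putting the pieces together, a compilable sketch of the serve path after this PR: print the startup line, bind `host:port`, and serve. `Router::new()` stands in for the real `create_router(engine)`, and axum 0.7 plus tokio are assumptions; this is a sketch of the flow, not the PR's actual implementation:

```rust
use axum::Router;

async fn execute(host: &str, port: u16, model_name: &str) -> Result<(), Box<dyn std::error::Error>> {
    println!("Starting PUMA to serve model: {model_name}");
    let listener = tokio::net::TcpListener::bind(format!("{host}:{port}")).await?;
    let app = Router::new(); // stand-in for create_router(engine)
    axum::serve(listener, app).await?;
    Ok(())
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    execute("127.0.0.1", 8000, "inftyai/tiny-random-gpt2").await
}
```

Keeping the existence check in the CLI layer (commands.rs) rather than here means `execute` can assume a valid model name, at the cost that the registry could still change between the check and the bind.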