Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
da3ecfa
Implement cloud-hypervisor fork
sjmiller609 Feb 27, 2026
b6e67f8
Merge remote-tracking branch 'origin/main' into fork-vm
sjmiller609 Feb 27, 2026
5081871
move fork support checks behind hypervisor starter interface
sjmiller609 Feb 27, 2026
566d52b
api: add fork instance endpoint
sjmiller609 Feb 27, 2026
e80afcf
refactor: move CH fork snapshot rewrite into CH package
sjmiller609 Feb 27, 2026
936fe57
fork: support running source via standby-resume flow
sjmiller609 Feb 27, 2026
b47d065
Fix fork restore guest IP reconfiguration
sjmiller609 Feb 27, 2026
addd9db
Add QEMU fork support for running standby flow
sjmiller609 Feb 27, 2026
645bf63
Add fork target state and fix running-fork cleanup
sjmiller609 Feb 28, 2026
0481d35
Stabilize Linux CI test timeouts
sjmiller609 Feb 28, 2026
1dc53e3
Deep-copy metadata refs in fork path
sjmiller609 Feb 28, 2026
d3afe68
Fix fork CID persistence semantics and cleanup validation
sjmiller609 Feb 28, 2026
fdf6652
Merge origin/main and add firecracker fork support
sjmiller609 Mar 2, 2026
75ce151
Stabilize firecracker fork integration test assertions
sjmiller609 Mar 2, 2026
4fc57c1
Harden fork rewrite safety and volume fork validation
sjmiller609 Mar 2, 2026
321fbfd
Add Firecracker standby fork support
sjmiller609 Mar 2, 2026
3a94ba6
Address remaining fork review findings
sjmiller609 Mar 2, 2026
1be08cb
Add firecracker fork support
sjmiller609 Mar 2, 2026
a2ad083
Use running Firecracker fork test and gate on guest-agent readiness
sjmiller609 Mar 2, 2026
5189e4e
Fail running firecracker fork when guest agent is not ready
sjmiller609 Mar 2, 2026
834f2d4
Serialize firecracker snapshot source aliasing during restore
sjmiller609 Mar 2, 2026
9f362af
Fix fork name collisions and running-source restore ordering
sjmiller609 Mar 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ jobs:
CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
TLS_TEST_DOMAIN: "test.hypeman-development.com"
TLS_ALLOWED_DOMAINS: '*.hypeman-development.com'
run: make test
run: make test TEST_TIMEOUT=20m

test-darwin:
runs-on: [self-hosted, macos, arm64]
Expand Down
68 changes: 68 additions & 0 deletions cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,74 @@ func (s *ApiService) RestoreInstance(ctx context.Context, request oapi.RestoreIn
return oapi.RestoreInstance200JSONResponse(instanceToOAPI(*result)), nil
}

// ForkInstance forks an instance from stopped or standby into a new instance.
// The id parameter can be an instance ID, name, or ID prefix.
// Note: Resolution is handled by ResolveResource middleware.
func (s *ApiService) ForkInstance(ctx context.Context, request oapi.ForkInstanceRequestObject) (oapi.ForkInstanceResponseObject, error) {
	inst := mw.GetResolvedInstance[instances.Instance](ctx)
	if inst == nil {
		return oapi.ForkInstance500JSONResponse{
			Code: "internal_error",
			Message: "resource not resolved",
		}, nil
	}
	log := logger.FromContext(ctx)

	body := request.Body
	if body == nil {
		return oapi.ForkInstance400JSONResponse{
			Code: "invalid_request",
			Message: "request body is required",
		}, nil
	}

	// An omitted target_state maps to the zero State, which the manager
	// interprets as its default post-fork state.
	var desiredState instances.State
	if body.TargetState != nil {
		desiredState = instances.State(*body.TargetState)
	}

	forked, err := s.InstanceManager.ForkInstance(ctx, inst.Id, instances.ForkInstanceRequest{
		Name: body.Name,
		FromRunning: body.FromRunning != nil && *body.FromRunning,
		TargetState: desiredState,
	})
	if err == nil {
		return oapi.ForkInstance201JSONResponse(instanceToOAPI(*forked)), nil
	}

	// Translate domain-level sentinel errors into the matching HTTP responses.
	switch {
	case errors.Is(err, instances.ErrNotFound):
		return oapi.ForkInstance404JSONResponse{
			Code: "not_found",
			Message: "instance not found",
		}, nil
	case errors.Is(err, instances.ErrInvalidState):
		return oapi.ForkInstance409JSONResponse{
			Code: "invalid_state",
			Message: err.Error(),
		}, nil
	case errors.Is(err, instances.ErrInvalidRequest):
		return oapi.ForkInstance400JSONResponse{
			Code: "invalid_request",
			Message: err.Error(),
		}, nil
	case errors.Is(err, instances.ErrAlreadyExists), errors.Is(err, network.ErrNameExists):
		return oapi.ForkInstance409JSONResponse{
			Code: "name_conflict",
			Message: err.Error(),
		}, nil
	case errors.Is(err, instances.ErrNotSupported):
		return oapi.ForkInstance501JSONResponse{
			Code: "not_supported",
			Message: err.Error(),
		}, nil
	default:
		log.ErrorContext(ctx, "failed to fork instance", "error", err)
		return oapi.ForkInstance500JSONResponse{
			Code: "internal_error",
			Message: "failed to fork instance",
		}, nil
	}
}

// StopInstance gracefully stops a running instance
// The id parameter can be an instance ID, name, or ID prefix
// Note: Resolution is handled by ResolveResource middleware
Expand Down
199 changes: 199 additions & 0 deletions cmd/api/api/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@ package api

import (
"context"
"fmt"
"os"
"testing"
"time"

"github.com/c2h5oh/datasize"
"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/instances"
mw "github.com/kernel/hypeman/lib/middleware"
"github.com/kernel/hypeman/lib/oapi"
"github.com/kernel/hypeman/lib/paths"
"github.com/kernel/hypeman/lib/system"
Expand Down Expand Up @@ -137,6 +139,24 @@ type captureCreateManager struct {
lastReq *instances.CreateInstanceRequest
}

// captureForkManager wraps an instances.Manager and overrides ForkInstance so
// tests can record the arguments it received and control the returned
// result or error.
type captureForkManager struct {
	instances.Manager
	lastID  string
	lastReq *instances.ForkInstanceRequest
	result  *instances.Instance
	err     error
}

// ForkInstance records the id and a copy of req, then returns the
// pre-configured error (if any) or the pre-configured result.
func (m *captureForkManager) ForkInstance(ctx context.Context, id string, req instances.ForkInstanceRequest) (*instances.Instance, error) {
	m.lastID = id
	captured := req
	m.lastReq = &captured
	if m.err != nil {
		return nil, m.err
	}
	return m.result, nil
}

func (m *captureCreateManager) CreateInstance(ctx context.Context, req instances.CreateInstanceRequest) (*instances.Instance, error) {
reqCopy := req
m.lastReq = &reqCopy
Expand Down Expand Up @@ -190,6 +210,185 @@ func TestCreateInstance_OmittedHotplugSizeDefaultsToZero(t *testing.T) {
assert.Equal(t, int64(0), int64(hotplugBytes), "response should report zero hotplug_size when omitted")
}

// TestForkInstance_Success covers the happy path: forking a resolved stopped
// instance yields a 201 response, and the manager receives the source id and
// a request with the defaults (FromRunning false, empty TargetState).
func TestForkInstance_Success(t *testing.T) {
	svc := newTestService(t)

	createdAt := time.Now()
	src := instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id: "src-instance",
			Name: "src-instance",
			Image: "docker.io/library/alpine:latest",
			CreatedAt: createdAt,
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateStopped,
	}

	want := &instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id: "forked-instance",
			Name: "forked-instance",
			Image: "docker.io/library/alpine:latest",
			CreatedAt: createdAt,
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateStopped,
	}

	capture := &captureForkManager{
		Manager: svc.InstanceManager,
		result: want,
	}
	svc.InstanceManager = capture

	req := oapi.ForkInstanceRequestObject{
		Id: src.Id,
		Body: &oapi.ForkInstanceRequest{Name: "forked-instance"},
	}
	resp, err := svc.ForkInstance(mw.WithResolvedInstance(ctx(), src.Id, src), req)
	require.NoError(t, err)

	created, ok := resp.(oapi.ForkInstance201JSONResponse)
	require.True(t, ok, "expected 201 response")
	assert.Equal(t, "forked-instance", created.Name)
	assert.Equal(t, src.Id, capture.lastID)
	require.NotNil(t, capture.lastReq)
	assert.Equal(t, "forked-instance", capture.lastReq.Name)
	assert.False(t, capture.lastReq.FromRunning)
	assert.Equal(t, instances.State(""), capture.lastReq.TargetState)
}

// TestForkInstance_NotSupported verifies that instances.ErrNotSupported from
// the manager is surfaced as a 501 response with the "not_supported" code.
func TestForkInstance_NotSupported(t *testing.T) {
	svc := newTestService(t)

	src := instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id: "src-instance",
			Name: "src-instance",
			Image: "docker.io/library/alpine:latest",
			CreatedAt: time.Now(),
			HypervisorType: hypervisor.TypeQEMU,
		},
		State: instances.StateStopped,
	}

	capture := &captureForkManager{
		Manager: svc.InstanceManager,
		err: instances.ErrNotSupported,
	}
	svc.InstanceManager = capture

	req := oapi.ForkInstanceRequestObject{
		Id: src.Id,
		Body: &oapi.ForkInstanceRequest{Name: "forked-instance"},
	}
	resp, err := svc.ForkInstance(mw.WithResolvedInstance(ctx(), src.Id, src), req)
	require.NoError(t, err)

	got, ok := resp.(oapi.ForkInstance501JSONResponse)
	require.True(t, ok, "expected 501 response")
	assert.Equal(t, "not_supported", got.Code)
}

// TestForkInstance_InvalidRequest verifies that an error wrapping
// instances.ErrInvalidRequest is surfaced as a 400 response with the
// "invalid_request" code.
func TestForkInstance_InvalidRequest(t *testing.T) {
	svc := newTestService(t)

	src := instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id: "src-instance",
			Name: "src-instance",
			Image: "docker.io/library/alpine:latest",
			CreatedAt: time.Now(),
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateStopped,
	}

	capture := &captureForkManager{
		Manager: svc.InstanceManager,
		err: fmt.Errorf("%w: name is required", instances.ErrInvalidRequest),
	}
	svc.InstanceManager = capture

	req := oapi.ForkInstanceRequestObject{
		Id: src.Id,
		Body: &oapi.ForkInstanceRequest{Name: ""},
	}
	resp, err := svc.ForkInstance(mw.WithResolvedInstance(ctx(), src.Id, src), req)
	require.NoError(t, err)

	got, ok := resp.(oapi.ForkInstance400JSONResponse)
	require.True(t, ok, "expected 400 response")
	assert.Equal(t, "invalid_request", got.Code)
}

// TestForkInstance_FromRunningFlagForwarded verifies that the optional
// from_running and target_state request fields are forwarded verbatim to the
// manager's ForkInstanceRequest.
func TestForkInstance_FromRunningFlagForwarded(t *testing.T) {
	svc := newTestService(t)

	createdAt := time.Now()
	src := instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id: "src-instance",
			Name: "src-instance",
			Image: "docker.io/library/alpine:latest",
			CreatedAt: createdAt,
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateRunning,
	}

	want := &instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id: "forked-instance",
			Name: "forked-instance",
			Image: "docker.io/library/alpine:latest",
			CreatedAt: createdAt,
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateStandby,
	}

	capture := &captureForkManager{
		Manager: svc.InstanceManager,
		result: want,
	}
	svc.InstanceManager = capture

	fromRunning := true
	desired := oapi.ForkTargetStateRunning
	req := oapi.ForkInstanceRequestObject{
		Id: src.Id,
		Body: &oapi.ForkInstanceRequest{
			Name: "forked-instance",
			FromRunning: &fromRunning,
			TargetState: &desired,
		},
	}
	resp, err := svc.ForkInstance(mw.WithResolvedInstance(ctx(), src.Id, src), req)
	require.NoError(t, err)

	_, ok := resp.(oapi.ForkInstance201JSONResponse)
	require.True(t, ok, "expected 201 response")
	require.NotNil(t, capture.lastReq)
	assert.True(t, capture.lastReq.FromRunning)
	assert.Equal(t, instances.StateRunning, capture.lastReq.TargetState)
}

func TestInstanceLifecycle_StopStart(t *testing.T) {
// Require KVM access for VM creation
if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) {
Expand Down
4 changes: 4 additions & 0 deletions lib/builds/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ func (m *mockInstanceManager) DeleteInstance(ctx context.Context, id string) err
return nil
}

// ForkInstance is a test stub satisfying the instances manager interface;
// the builds tests never fork, so it always reports the instance as not found.
func (m *mockInstanceManager) ForkInstance(ctx context.Context, id string, req instances.ForkInstanceRequest) (*instances.Instance, error) {
	return nil, instances.ErrNotFound
}

// StandbyInstance is a no-op test stub; it returns a nil instance and nil
// error without touching any state.
func (m *mockInstanceManager) StandbyInstance(ctx context.Context, id string) (*instances.Instance, error) {
	return nil, nil
}
Expand Down
64 changes: 64 additions & 0 deletions lib/forkvm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# VM Forking: Hypervisor Behavior

This document describes hypervisor-specific fork behavior and how fork is made
to work across implementations.

## Common fork model

- **Stopped source**: clone VM data and start a new VM from copied state.
- **Standby source**: clone data + snapshot artifacts, then adapt snapshot
identity for the fork (paths, network, vsock behavior varies by hypervisor).
- **Running source**: transition source to standby, fork from that standby
snapshot, then restore the source.

For networked forks, the fork gets a fresh host/guest identity (IP, MAC, TAP)
instead of reusing the source identity.

## Cloud Hypervisor

- Snapshot-based forks are supported by rewriting snapshot configuration before
restore.
- Path rewrites are constrained to exact source-directory matches or source-dir
path prefixes to avoid mutating unrelated values.
- Serial log path, vsock socket path, and network fields are updated for the
fork.
- Vsock CID is intentionally kept stable for snapshot restore compatibility.
- Running-source fork works by standby -> fork -> restore source, with source
and fork separated by rewritten runtime endpoints.

## QEMU

- Snapshot-based forks are supported by rewriting QEMU snapshot VM config.
- Rewrites are explicit and path-safe (source-dir exact/prefix replacement),
applied to disk/kernel/initrd/serial/vsock socket paths.
- Kernel arguments are left unchanged (not blanket-rewritten), to avoid
accidental mutation of non-path text.
- Network identity is updated in snapshot config for the fork.
- Vsock CID updates are supported for snapshot state, so running-source fork can
rotate source CID when needed to avoid CID collision after restore.

## Firecracker

- Firecracker snapshot restore supports **network overrides** but does not
expose a full snapshot-config rewrite surface for arbitrary embedded paths.
- To make standby/running fork work, fork preparation stores desired network
override data and source->target data-directory mapping.
- During restore, the source data path is temporarily aliased to the fork data
path so embedded snapshot paths resolve for the fork, then aliasing is
cleaned up.
- Network override fields are supplied at snapshot load to bind the fork to its
own TAP device.
- Vsock CID remains stable for snapshot-based flows.

## VZ (Virtualization.framework)

- Fork is not supported.
- Snapshot restore for Linux guests is not available in this mode, so standby
snapshot-based fork mechanics cannot be implemented.

## Operational constraints

- Writable attached volumes are rejected when forking, to prevent concurrent
  cross-VM writes to the same backing data.
- If a post-fork target-state transition fails, the partially created fork is
cleaned up rather than left orphaned.
Loading