Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions platformd/checkpoint/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,16 +278,11 @@ func (s *ServiceImpl) checkpoint(ctx context.Context, id string, baseRef name.Re
return fmt.Errorf("kill sockets: %w", err)
}

// immediately checkpointing after canceling the attach stream in waitContainerReady
// leads to this error consistently appearing:
// wait a little after closing all sockets, because checkpointing right away still seems to produce:
//
// Error (criu/sk-inet.c:191): inet: Connected TCP socket, consider using --tcp-established option.
//
// the problem is, that there seems to be an open tcp connection which blocks criu
// from checkpointing. i don't know where this is coming from, my workaround for
// now is to just wait a bit.
// FIXME: possible solution could be to cut internet access after the server has
// initialized.
// linux might take some time to close all sockets eventually.
time.Sleep(s.cfg.WaitAfterServerInit)

logger.InfoContext(ctx, "checkpointing container", "container_id", ctrID)
Expand Down
10 changes: 8 additions & 2 deletions platformd/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,8 +293,14 @@ func (s *Server) Run(ctx context.Context, cfg Config) error {
ContainerPath: "/etc/envoy/config.yaml",
},
{
HostPath: cfg.ManagementServerListenSock.Path,
ContainerPath: cfg.ManagementServerListenSock.Path,
// mount the parent dir of the socket instead of the socket
// path directly, because if platformd restarts, the socket
// mount goes stale as the socket is closed. this causes
// all client requests to fail until we restart the container.
// the parent dir is still present while platformd is stopped,
// so once it's back up clients can access the socket again.
HostPath: filepath.Dir(cfg.ManagementServerListenSock.Path),
ContainerPath: filepath.Dir(cfg.ManagementServerListenSock.Path),
},
},
Linux: &runtimev1.LinuxContainerConfig{
Expand Down
10 changes: 8 additions & 2 deletions platformd/workload/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,14 @@ func (s *svc) RunWorkload(ctx context.Context, w Workload, attempt uint) error {
LogPath: "servermon.slog",
Mounts: []*runtimev1.Mount{
{
HostPath: s.cfg.PlatformdListenSockURL.Path,
ContainerPath: s.cfg.PlatformdListenSockURL.Path,
// mount the parent dir of the socket instead of the socket
// path directly, because if platformd restarts, the socket
// mount goes stale as the socket is closed. this causes
// all client requests to fail until we restart the container.
// the parent dir is still present while platformd is stopped,
// so once it's back up clients can access the socket again.
HostPath: filepath.Dir(s.cfg.PlatformdListenSockURL.Path),
ContainerPath: filepath.Dir(s.cfg.PlatformdListenSockURL.Path),
},
},
Linux: &runtimev1.LinuxContainerConfig{
Expand Down
5 changes: 3 additions & 2 deletions platformd/workload/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"fmt"
"log/slog"
"os"
"path/filepath"
"strings"
"testing"

Expand Down Expand Up @@ -144,8 +145,8 @@ func TestRunWorkload(t *testing.T) {
LogPath: "servermon.slog",
Mounts: []*runtimev1.Mount{
{
HostPath: cfg.PlatformdListenSockURL.Path,
ContainerPath: cfg.PlatformdListenSockURL.Path,
HostPath: filepath.Dir(cfg.PlatformdListenSockURL.Path),
ContainerPath: filepath.Dir(cfg.PlatformdListenSockURL.Path),
},
},
Linux: &runtimev1.LinuxContainerConfig{
Expand Down