diff --git a/platformd/checkpoint/service.go b/platformd/checkpoint/service.go index 2273b69c..0a932acd 100644 --- a/platformd/checkpoint/service.go +++ b/platformd/checkpoint/service.go @@ -278,16 +278,11 @@ func (s *ServiceImpl) checkpoint(ctx context.Context, id string, baseRef name.Re return fmt.Errorf("kill sockets: %w", err) } - // immediately checkpointing after canceling the attach stream in waitContainerReady - // leads to this error consistently appearing: + // wait a little after closing all sockets, as it seems that if we directly checkpoint we still get: // // Error (criu/sk-inet.c:191): inet: Connected TCP socket, consider using --tcp-established option. // - // the problem is, that there seems to be an open tcp connection which blocks criu - // from checkpointing. i don't know where this is coming from, my workaround for - // now is to just wait a bit. - // FIXME: possible solution could be to cut internet access after the server has - // initialized. + // linux might take some time to close all sockets eventually. time.Sleep(s.cfg.WaitAfterServerInit) logger.InfoContext(ctx, "checkpointing container", "container_id", ctrID) diff --git a/platformd/server.go b/platformd/server.go index aa9b1655..082497de 100644 --- a/platformd/server.go +++ b/platformd/server.go @@ -293,8 +293,14 @@ func (s *Server) Run(ctx context.Context, cfg Config) error { ContainerPath: "/etc/envoy/config.yaml", }, { - HostPath: cfg.ManagementServerListenSock.Path, - ContainerPath: cfg.ManagementServerListenSock.Path, + // mount the parent dir of the socket instead of the socket + // path directly, because if platformd restarts the socket + // mount will go stale as the socket is closed. this causes + // all clients requests to fail until we restart the container. + // the parent dir will still be present, if platformd is stopped. + // once it's back up clients can access the socket again. + HostPath: filepath.Dir(cfg.ManagementServerListenSock.Path), + ContainerPath: filepath.Dir(cfg.ManagementServerListenSock.Path), }, }, Linux: &runtimev1.LinuxContainerConfig{ diff --git a/platformd/workload/service.go b/platformd/workload/service.go index 6615eeac..67a301a8 100644 --- a/platformd/workload/service.go +++ b/platformd/workload/service.go @@ -174,8 +174,14 @@ func (s *svc) RunWorkload(ctx context.Context, w Workload, attempt uint) error { LogPath: "servermon.slog", Mounts: []*runtimev1.Mount{ { - HostPath: s.cfg.PlatformdListenSockURL.Path, - ContainerPath: s.cfg.PlatformdListenSockURL.Path, + // mount the parent dir of the socket instead of the socket + // path directly, because if platformd restarts the socket + // mount will go stale as the socket is closed. this causes + // all clients requests to fail until we restart the container. + // the parent dir will still be present, if platformd is stopped. + // once it's back up clients can access the socket again. + HostPath: filepath.Dir(s.cfg.PlatformdListenSockURL.Path), + ContainerPath: filepath.Dir(s.cfg.PlatformdListenSockURL.Path), }, }, Linux: &runtimev1.LinuxContainerConfig{ diff --git a/platformd/workload/service_test.go b/platformd/workload/service_test.go index d37a1886..984cb260 100644 --- a/platformd/workload/service_test.go +++ b/platformd/workload/service_test.go @@ -23,6 +23,7 @@ import ( "fmt" "log/slog" "os" + "path/filepath" "strings" "testing" @@ -144,8 +145,8 @@ func TestRunWorkload(t *testing.T) { LogPath: "servermon.slog", Mounts: []*runtimev1.Mount{ { - HostPath: cfg.PlatformdListenSockURL.Path, - ContainerPath: cfg.PlatformdListenSockURL.Path, + HostPath: filepath.Dir(cfg.PlatformdListenSockURL.Path), + ContainerPath: filepath.Dir(cfg.PlatformdListenSockURL.Path), }, }, Linux: &runtimev1.LinuxContainerConfig{