From 7d15576fbf741dd29aed6b5a700c9a0f5d08db8c Mon Sep 17 00:00:00 2001 From: yannic rieger Date: Sat, 23 May 2026 18:24:37 +0200 Subject: [PATCH 1/2] remove WaitAfterServerInit config option --- Makefile | 2 +- cmd/platformd/main.go | 56 +++++++++++++++----------------- internal/datapath/objects.go | 12 ++++--- internal/datapath/sock_bpfeb.o | Bin 87368 -> 87368 bytes internal/datapath/sock_bpfel.o | Bin 87368 -> 87368 bytes platformd/checkpoint/config.go | 1 - platformd/checkpoint/service.go | 13 ++++---- platformd/config.go | 1 - platformd/server.go | 2 +- 9 files changed, 42 insertions(+), 45 deletions(-) diff --git a/Makefile b/Makefile index 3d3e1ab7..9642b4cb 100644 --- a/Makefile +++ b/Makefile @@ -131,7 +131,7 @@ functests-shared: $(TEST_IMG) .PHONY: ebpf ebpf: $(RUN) $(SUDO) go generate ./internal/datapath - $(RUN) $(SUDO) go test ./internal/datapath -run TestVerifier -count 1 + $(RUN) $(SUDO) go test -tags bpf ./internal/datapath -run TestVerifier -count 1 $(TEST_IMG): @docker build -t test-img -f $(IMG_TESTDATA_DIR)/Dockerfile $(IMG_TESTDATA_DIR) diff --git a/cmd/platformd/main.go b/cmd/platformd/main.go index 8492a7b7..9c4d66dc 100644 --- a/cmd/platformd/main.go +++ b/cmd/platformd/main.go @@ -25,34 +25,33 @@ func main() { mgmtListenSock = fs.String("management-server-listen-sock", "/run/platformd/platformd.sock", "path to the unix domain socket to listen on") //nolint:lll mgmtSockUID = fs.Uint64("management-server-listen-sock-uid", 9012, "unix domain socket uid") mgmtSockGID = fs.Uint64("management-server-listen-sock-gid", 9012, "unix domain socket gid") - criListenSock = fs.String("cri-listen-sock", "/var/run/crio/crio.sock", "path to the unix domain socket the CRI is listening on") //nolint:lll - envoyImage = fs.String("envoy-image", "", "container image to use for envoy") //nolint:lll - coreDNSImage = fs.String("coredns-image", "", "container image to use for CoreDNS") //nolint:lll - getsockoptCgroup = fs.String("getsockopt-cgroup", "", "container image to use for coredns") //nolint:lll - dnsServer = fs.String("dns-server", "", "dns server used by the containers") //nolint:lll - hostIface = fs.String("host-iface", "", "internet-facing network interface for ingress and egress traffic") //nolint:lll - maxAttempts = fs.Uint("max-attempts", 5, "maximum number of attempts workload creation attempts") //nolint:lll - syncInterval = fs.Duration("sync-interval", 200*time.Millisecond, "i") //nolint:lll - nodeID = fs.String("node-id", "", "unique node id") //nolint:lll - minPort = fs.Uint("min-port", 30000, "start of the port range") //nolint:lll - maxPort = fs.Uint("max-port", 40000, "end of the port range") //nolint:lll - workloadNamespace = fs.String("workload-namespace", "", "namespace where the workload is deployed") //nolint:lll - registryEndpoint = fs.String("registry-endpoint", "", "registry endpoint where base images will be pulled from and checkpoints pushed to") //nolint:lll - registryUser = fs.String("registry-user", "", "user for the registry") //nolint:lll - registryPass = fs.String("registry-password", "", "password for the registry") //nolint:lll - controlPlaneEndpoint = fs.String("control-plane-endpoint", "", "control plane endpoint") //nolint:lll - checkCPUPeriod = fs.Uint64("checkpoint-cpu-period", 0, "period of checking CPU period") //nolint:lll - checkCPUQuota = fs.Uint64("checkpoint-cpu-quota", 0, "quota of checking CPU quota") //nolint:lll - checkMemoryLimitInBytes = fs.Uint64("checkpoint-memory-limit-bytes", 0, "memory limit of the container that will be checkpointed") //nolint:lll - checkLocationDir = fs.String("checkpoint-file-dir", "/tmp/platformd", "directory where checkpoint files will be stored") //nolint:lll - checkTimeout = fs.Uint64("checkpoint-timeout-seconds", 60, "timeout for checkpoint creation") //nolint:lll - checkListenAddr = fs.String("checkpoint-listen-addr", "", "timeout for checkpoint creation") //nolint:lll - checkStatusRetentionDuration = fs.Duration("checkpoint-status-retention-period", 1*time.Minute, "timeout for checkpoint creation") //nolint:lll - checkContainerReadyTimeout = fs.Duration("checkpoint-container-ready-timeout", 1*time.Minute, "maximum time to wait until the container is ready for checkpointing") //nolint:lll - checkWaitAfterServerInit = fs.Duration("checkpoint-wait-server-init", 10*time.Second, "how long to wait before performing a checkpoint after server has initialized ") //nolint:lll - mcServerManagementAPIToken = fs.String("mc-server-management-api-token", "", "token to use for the minecraft server management api") //nolint:lll - serverMonImage = fs.String("servermon-image", "", "image to use for the servermon container") //nolint:lll - _ = fs.String("config", "/etc/platformd/config.json", "path to the config file") //nolint:lll + criListenSock = fs.String("cri-listen-sock", "/var/run/crio/crio.sock", "path to the unix domain socket the CRI is listening on") //nolint:lll + envoyImage = fs.String("envoy-image", "", "container image to use for envoy") //nolint:lll + coreDNSImage = fs.String("coredns-image", "", "container image to use for CoreDNS") //nolint:lll + getsockoptCgroup = fs.String("getsockopt-cgroup", "", "container image to use for coredns") //nolint:lll + dnsServer = fs.String("dns-server", "", "dns server used by the containers") //nolint:lll + hostIface = fs.String("host-iface", "", "internet-facing network interface for ingress and egress traffic") //nolint:lll + maxAttempts = fs.Uint("max-attempts", 5, "maximum number of attempts workload creation attempts") //nolint:lll + syncInterval = fs.Duration("sync-interval", 200*time.Millisecond, "i") //nolint:lll + nodeID = fs.String("node-id", "", "unique node id") //nolint:lll + minPort = fs.Uint("min-port", 30000, "start of the port range") //nolint:lll + maxPort = fs.Uint("max-port", 40000, "end of the port range") //nolint:lll + workloadNamespace = fs.String("workload-namespace", "", "namespace where the workload is deployed") //nolint:lll + registryEndpoint = fs.String("registry-endpoint", "", "registry endpoint where base images will be pulled from and checkpoints pushed to") //nolint:lll + registryUser = fs.String("registry-user", "", "user for the registry") //nolint:lll + registryPass = fs.String("registry-password", "", "password for the registry") //nolint:lll + controlPlaneEndpoint = fs.String("control-plane-endpoint", "", "control plane endpoint") //nolint:lll + checkCPUPeriod = fs.Uint64("checkpoint-cpu-period", 0, "period of checking CPU period") //nolint:lll + checkCPUQuota = fs.Uint64("checkpoint-cpu-quota", 0, "quota of checking CPU quota") //nolint:lll + checkMemoryLimitInBytes = fs.Uint64("checkpoint-memory-limit-bytes", 0, "memory limit of the container that will be checkpointed") //nolint:lll + checkLocationDir = fs.String("checkpoint-file-dir", "/tmp/platformd", "directory where checkpoint files will be stored") //nolint:lll + checkTimeout = fs.Uint64("checkpoint-timeout-seconds", 60, "timeout for checkpoint creation") //nolint:lll + checkListenAddr = fs.String("checkpoint-listen-addr", "", "timeout for checkpoint creation") //nolint:lll + checkStatusRetentionDuration = fs.Duration("checkpoint-status-retention-period", 1*time.Minute, "timeout for checkpoint creation") //nolint:lll + checkContainerReadyTimeout = fs.Duration("checkpoint-container-ready-timeout", 1*time.Minute, "maximum time to wait until the container is ready for checkpointing") //nolint:lll + mcServerManagementAPIToken = fs.String("mc-server-management-api-token", "", "token to use for the minecraft server management api") //nolint:lll + serverMonImage = fs.String("servermon-image", "", "image to use for the servermon container") //nolint:lll + _ = fs.String("config", "/etc/platformd/config.json", "path to the config file") //nolint:lll ) if err := ff.Parse(fs, os.Args[1:], ff.WithEnvVarPrefix("PLATFORMD"), @@ -97,7 +96,6 @@ func main() { ListenAddr: *checkListenAddr, StatusRetentionPeriod: *checkStatusRetentionDuration, ContainerReadyTimeout: *checkContainerReadyTimeout, - WaitAfterServerInit: *checkWaitAfterServerInit, }, ManagementSocketUID: *mgmtSockUID, ManagementSocketGID: *mgmtSockGID, diff --git a/internal/datapath/objects.go b/internal/datapath/objects.go index 9194794b..e112833e 100644 --- a/internal/datapath/objects.go +++ b/internal/datapath/objects.go @@ -127,8 +127,9 @@ func (o *Objects) BlockIP4Connections(cgroupPath string) error { Attach: ebpf.AttachCGroupInet4Connect, Path: cgroupPath, }); err != nil { - return err + return fmt.Errorf("attach: %w", err) } + return nil } @@ -138,8 +139,9 @@ func (o *Objects) BlockIP6Connections(cgroupPath string) error { Attach: ebpf.AttachCGroupInet6Connect, Path: cgroupPath, }); err != nil { - return err + return fmt.Errorf("attach: %w", err) } + return nil } @@ -155,7 +157,7 @@ func (o *Objects) AttachAndPinSNAT(iface *net.Interface) error { // pin because cni is short-lived if err := l.Pin(fmt.Sprintf("%s/snat_%s", ProgPinPath, iface.Name)); err != nil { - return fmt.Errorf("pin link: %w", err) + return fmt.Errorf("pin: %w", err) } return nil @@ -177,7 +179,7 @@ func (o *Objects) AttachAndPinDNAT(iface *net.Interface) error { // TODO: update prog return nil } - return fmt.Errorf("pin link: %w", err) + return fmt.Errorf("pin: %w", err) } return nil @@ -195,7 +197,7 @@ func (o *Objects) AttachAndPinARP(iface *net.Interface) error { // pin because cni is short-lived if err := l.Pin(fmt.Sprintf("%s/arp_%s", ProgPinPath, iface.Name)); err != nil { - return fmt.Errorf("pin link: %w", err) + return fmt.Errorf("pin: %w", err) } return nil diff --git a/internal/datapath/sock_bpfeb.o b/internal/datapath/sock_bpfeb.o index 10843cc07bc4f1f39a7c4d9208ba5646c8e251e6..f309b7ba64bbf3e6a54058d120896443f36cc5e8 100644 GIT binary patch delta 3597 zcmYk9eQXp(7{+HPr3fvyqLxESj~)V10Z|YPG@c=9OM!w#Afi$=Rsf(eL#28C*T-rJRVCf%LiJoC=i`_63V zIvO)O8Z(<5tuM8#(JyZHJ)EsnH>EOmC?o(<#%`rTlbqZwBlT=2i1%D)@s@k7Dp+dN zwxf1OccpFzAA|WS$k);hRp<+7c>19j`UbCJa1eYLJP>SyPhp(p;Ck>m=tICw;LB+H zB{&7X4!u+*m)PqV{K0gXQ+=6cyxR1y8gYWQo92f&h178q~&eL21hNgN!uLMu< z*aklaMjKx}cnT)57n}k=2i~Vte^l30Dd&Tx5v2>jemqDui}3Q}$;IP&uKQ-_ufaDm z=haH}{RCZnE%nfCFnpp@;E0pk99Z}clnzt{p}dD24V4XEha3}C_2~QxS{wkUpf@1L z3=jlW(}))qq*!r!W}=T#2JEAeZ?d)kvg;{Yr&pmBK-=cp(E%C3paGakdQ}>Tv=*+GR^z6ModI zSPPyCmd5qqIml(=DR32X(NBXz!2MVZis4n5oB$RdM*EEK!ARi^5G36N-UJ)`DcKRU z;IjLabN16aixH^|_*L65ZO(K}VIJNM{h;&#-UB}5u|!ce7E<_tGp=PpPAANJpg3Cw zaGF{I#m7;CO}8Lyy#BjYg3l;)1l z5Ig~({Sxb;$oMbnXP%HC8WwdR*Ln%bOohHKl2eDQo}0?h({AYL0myRpV%UPm$-r0 z3za3ITG>Vw-xB{se1`Zk@jr$mImEP&BST0ZLtK{S+`Y5Nnn@DHB(j9`mBbG5hlZzC z8jetuA}yq+441EQIqieUep3-SL42P0FT>GH;@-r4iRrvWX>(SPKcc~`3W-5FMJH3^ zXNg}SUPhcC_EsYuMb|rPcLl7iR7<-vN+FIOC7ZK`%il5_{e$>l!?B*kwCwVFS5Nyj zRzyZ43{P8SI5wV`4r{EO^tr?fiRsY9=zA4&veM(H{Tkb78f4xu9P>_r7{#`ePG7AU zc_?)fVtgB;`rsXPmCK zfb|9`HN#`HlPc)*8~=^;F2e~r%L&@Zy{pNl$f^H4@caX$ zP<#?|jGnXGaAH33a$?&3i6rR_&fxaIDB6|@?(HkGJVJcBWm)^+Q0w~f`fzsg^%6Zx zSGVpj(S5a5pi~ELZ)!1~ZTR6ZR;h!ym4xK-l!Hf-=qr>uKL{)rL@9ry)P-BXez)BH zZ`@^saa)x7doC%REx$g3RF(@-iCeZd;pXkfok-ypCJoZUFfeX|H#@*BOjWw&6(^MT z%YgITvW0R`%HG6pmY2Epp#r6+$-)v|d92d2u`v0yp+_Niy^}mSNiWjX$;oB9Fi)=O slKSB$h96wk#$>2W&()pD>C^S#)~+&rHA`Pj{$8P%>cz=@~ delta 3600 zcmYk(V8H?*@klWt1n5D5@+fdT3^gK3Xa$5u(1=8hpcM&-=yz@x<~zyWe&#o`yYt%Z zv~@#f>xRsPCFLJY%jIjE+@rFENE0G`r$7Rb()S1vc;3ow93m%MKKxF%<}^NNRH3Fs z>3)>%&Jf~m_!O!yLHKIwLJ>HO*g+wBAshG-KgF;Q{ub^FoA7x&XAvBSFCsnwPQabG z_7I$eZy;VOgd3+I9wi0)Qx1L0i9L*YEfBVg217r1>e>ZM!)KL%$Z?uVan zY{JjMXq-C^m!lErggXiQ;m?KWjpAUHuvR4om(GGcIFN#K@Z-Uun&FIG#wNtyLW9*n zek4TC&A7!8RgQQK`~}7;2}i8lE#6rlBGQT?A0igcs;4Hr24^%>#BuXKamCkg67dF{ zXQU=U1UJIyr0jyYdLTwrq9vm{5pRJ#h$mp2I?8b#WY?iZg~ybvRUovx=XCL*P+#2HDV3JXryq53NQV9a1T@76Fy|!0S;* zHBhYx>TuaZ!uo0Jy~T~V4C5+IRBg(%tD<^nFXG>*d%%a_USVJQ9Zy)G;%*U`?7XtSZMh$*2#RV$9%sRa7q><4TAyikLm6+q~ZuVTL*e+FK zC$lHTJDec;ZuW8Asv^um+Jjx%<_Pjr7{fKAil5%j&(>s`EQPmI{GfLEa@(mf2p`i0;STaI8ll? z5sF;Yb+VeYBbUgx7YV-R(JbrnmWrYtx;RQNb|rm2qC=?8a4Xs3bw5jyO7bh#jTWy# zAB(6#rSFpI<&Qd(qPiO0Om&*chsg9kM9)&(`B&K~aDQPS&9k_9%_!NHT|c`- zPLlJQTT5h5Y2*uW6dO1(2gi2&?=Vt`Hf&4+^N`QMp^}ORh4`%>toDd1|Ctb%?}R;e zKI58@8Oqo&s`3XeVMFbPeUw*wq7plAtQRudgKd$(Moe9h3Jip?A-sKqhhe{+UwlT$ zLiIpwwo>&wgnUNjorW8?@(bBZAF16Vob$iU`K8MWRw_=ul9jHfB15IckoNtm!{{?{@z)b9cYgh<{WFJdLG0UoEEyNAfB%7@pPYToWgjUx1v=-GtU z{qTK4IDUdlG43WBbUB3lke;cJ5JzAi=!Z|iec=Fn240F&R=^kFEacVjRrni>4a0Zf zT;#%y)vXX>xDbZ$pwi8(h=Fh~%R}MbmPf*UE&Jgt%Q^6HxCiDegrBk;fS-f);ws7WxhID?Rw~4B-o6j49TuEAr9*SC;{Y0^hCHa zCCXXjH9khtg2a#3O|V1v!m$4NnNb-8KT1IMj2b3dE{>!F1>(qX;Y0FD8c*TW-B8$w^KXRL-T@I+V-uD~%J-i7xt zjv^~R3onOTbti186vmq)3Dqs|_c}+}4x=PJh?ku(F0o@I%o~0zPyP1Q{k{;Ik7HN> z^VFDK6VP2Wz$dKfBJfGehvB|x*X6j%uAS>_L(&Z=n2O|>3e|dj9abMd?oMp}Lijv# zJxu_<2B%_F04Mt!j#%>x<=o>kdKl)F^Yjy<7#}+iMh%5y_tp>hMjxF|hSMzWV%~)>5b~(K9|u<%_qsfk}r{OlEo^aOLVqg znwvv;0eLogCAo~enOvztb*|hhO13)fvo}%xBl#TpGWo7@ z?sploa{XQ8b&pki43#$lKJadY8M3l3H?D)zrI;R!Ta_7scmo8I&i=3oJHRcaXS!Zy;|cSCbEzo|dRj z-$o7R$bXXWk`we3ll2E9ja*EhmrS+oYwxVbsdWr_I=O*<+~-iflw3*zNs@-p zUuiu+2Ltg)t)Jjx6AZ$DLaW}n3@7wR@m8#k{@wlZ|Alv2^(9t&bYqjHSZ~#%8=ERc z3hKk8buUG*W1O5Xi-Wm&vR`T!9szafgX2`Z5Ice&?hr*PvakjP;11TRPnVKflAL00@G09T&ULW+( zH1&*Pj!o-eGvpB(oxII!7=MQdoSJZ&fLAf^PTs-3tXDE z#5HY+t6C}LAJwbm`<1CfN`;6K!o4R}v>!41;5U>f+)qwa0elvz8R6%QVziWKc>uM0 z;m3t={R}Tdzx$}r$6@3L^+?@>I0gs6AbbW+gH8AnJQq`j;OlU22=>_y!kM;n;r?(Zj9Cc3Y}MPBovQNo`WOi|<64PiTUQ|(bq5@akRR0p;m(vP z&vw7@4U#4#LDVYPg{z5k1FV-7!ue;C5GT>20j@wk4Oe~Bx5BswA_3O-L*Kt7`0h|4 z>X7^3D&#&m3$BGHz(Kfqm=MD;Wq5nCY*Z1fK#F+-`)tWf_*!~h8VA&A*W(~C~*1@A;-8qDFGQ17f2Asuq z{u{giZq|*krR6X#t^^dfxX4;SRy4Y#)WwP_K_us(-8o;n5VP>L^PyK3-2QBXuorD~J{nH5T?qGrGcbb* z_k&w8aswQ&bKPkq>_eL%JPFo6zb0G^{}PK0HlR>~!UHKV8otJ`e)J9SY8BaewvWq1 zQHahL&{a2bo6&2c4r}erXm?eSUEV-e~p@gURjtK?BCWw+m$K* zr`~aT&7{1PeD6J{O&R4Y$m_@^~SO8ZZk?=O0ZYIDew$@}P+`*g}n$;-*#k}Fh?1ODtV zCH3Sc@>%lb>XHM`l*ahx)od6dTioR{bLBWWyXHu)>?)0Pyx+$);~g41+=X*6%Cbd> z*|=IOwetBW_qd5SI=kHuD0jSLz5nQbZ7$d)g?`D?<=fBVUk1Dw2dSKaa;{z8jMo{T zTN*?eh6~n6Rf5#di53+IJi#|6h1}`BS?-y0HQ&^o`Kt zMK?A{iX=E&+K*F|pBW)%%Gu>P`7%BE7TyB&(G6!ytPq>ZOY-HEm{y~F%ov$b^DtlD YjE}uHT&fj6rk1-3k diff --git a/platformd/checkpoint/config.go b/platformd/checkpoint/config.go index 751ed676..d6f47d3e 100644 --- a/platformd/checkpoint/config.go +++ b/platformd/checkpoint/config.go @@ -33,5 +33,4 @@ type Config struct { ListenAddr string StatusRetentionPeriod time.Duration ContainerReadyTimeout time.Duration - WaitAfterServerInit time.Duration } diff --git a/platformd/checkpoint/service.go b/platformd/checkpoint/service.go index 0a932acd..f49e6906 100644 --- a/platformd/checkpoint/service.go +++ b/platformd/checkpoint/service.go @@ -274,17 +274,16 @@ func (s *ServiceImpl) checkpoint(ctx context.Context, id string, baseRef name.Re return fmt.Errorf("find netns: %w", err) } + // sockets in state TCP_TIME_WAIT and TCP_CLOSE_WAIT, will be a left-over + // because our ebpf programs will not be able to close them. + // + // in order to make restoring the checkpoint work, we have to specify + // the --tcp-close option in /etc/criu/crun.conf to make it work. + // see https://criu.org/CLI/opt/--tcp-close for reference. if err := s.sockHandler.DestroySocks(netnsPath); err != nil { return fmt.Errorf("kill sockets: %w", err) } - // wait a little after closing all sockets, as it seems that if we directly checkpoint we still get: - // - // Error (criu/sk-inet.c:191): inet: Connected TCP socket, consider using --tcp-established option. - // - // linux might take some time to close all sockets eventually. - time.Sleep(s.cfg.WaitAfterServerInit) - logger.InfoContext(ctx, "checkpointing container", "container_id", ctrID) if _, err := s.criService.CheckpointContainer(ctx, &runtimev1.CheckpointContainerRequest{ diff --git a/platformd/config.go b/platformd/config.go index 4d72970f..7f3361c6 100644 --- a/platformd/config.go +++ b/platformd/config.go @@ -34,7 +34,6 @@ type Config struct { ListenAddr string StatusRetentionPeriod time.Duration ContainerReadyTimeout time.Duration - WaitAfterServerInit time.Duration } ManagementSocketUID uint64 ManagementSocketGID uint64 diff --git a/platformd/server.go b/platformd/server.go index 082497de..7a9aff18 100644 --- a/platformd/server.go +++ b/platformd/server.go @@ -145,7 +145,7 @@ func (s *Server) Run(ctx context.Context, cfg Config) error { ListenAddr: cfg.CheckpointConfig.ListenAddr, StatusRetentionPeriod: cfg.CheckpointConfig.StatusRetentionPeriod, ContainerReadyTimeout: cfg.CheckpointConfig.ContainerReadyTimeout, - WaitAfterServerInit: cfg.CheckpointConfig.WaitAfterServerInit, + SocketDestructionTimeout: cfg.CheckpointConfig.SocketDestructionTimeout, }, criSvc, image.NewService(checkSvcLogger, cfg.RegistryUser, cfg.RegistryPass, "/tmp"), From 5ee53fd21b420b4816b6c73504c32b2a8ee3db22 Mon Sep 17 00:00:00 2001 From: yannic rieger Date: Sat, 23 May 2026 18:26:18 +0200 Subject: [PATCH 2/2] forgot --- platformd/server.go | 1 - 1 file changed, 1 deletion(-) diff --git a/platformd/server.go b/platformd/server.go index 7a9aff18..fd6529b1 100644 --- a/platformd/server.go +++ b/platformd/server.go @@ -145,7 +145,6 @@ func (s *Server) Run(ctx context.Context, cfg Config) error { ListenAddr: cfg.CheckpointConfig.ListenAddr, StatusRetentionPeriod: cfg.CheckpointConfig.StatusRetentionPeriod, ContainerReadyTimeout: cfg.CheckpointConfig.ContainerReadyTimeout, - SocketDestructionTimeout: cfg.CheckpointConfig.SocketDestructionTimeout, }, criSvc, image.NewService(checkSvcLogger, cfg.RegistryUser, cfg.RegistryPass, "/tmp"),