From 67e4afc06e75386572ffb09f20f36a394635f525 Mon Sep 17 00:00:00 2001 From: Cuong Manh Le Date: Fri, 16 Jun 2023 20:56:21 +0700 Subject: [PATCH] cmd/ctrld: improving ctrld stability on router The current state of ctrld is very "high stakes" and easy to mess up, and is unforgiving when "ctrld start" failed. That would cause the router is in broken state, unrecoverable. This commit makes these changes to improve the state: - Moving router setup process after ctrld listeners are ready, so dnsmasq won't flood requests to ctrld even though the listeners are not ready to serve requests. - On router, when ctrld stopped, restore router DNS setup. That leaves the router in good state on reboot/startup, help removing the custom DNS server for NTP synchronization on some routers. - If self-check failed, uninstall ctrld to restore router to good state, prevent confusion that ctrld process is still running even though self-check reports it did not started. --- cmd/ctrld/cli.go | 104 +++++++++++++++++++++++--------------- cmd/ctrld/dns_proxy.go | 83 +++++++++++++++--------------- cmd/ctrld/prog.go | 18 +++++-- cmd/ctrld/prog_darwin.go | 11 ++-- cmd/ctrld/prog_freebsd.go | 2 - cmd/ctrld/prog_linux.go | 2 - cmd/ctrld/prog_others.go | 2 - internal/router/router.go | 2 +- 8 files changed, 121 insertions(+), 103 deletions(-) diff --git a/cmd/ctrld/cli.go b/cmd/ctrld/cli.go index 85b7f85..5e16aae 100644 --- a/cmd/ctrld/cli.go +++ b/cmd/ctrld/cli.go @@ -123,14 +123,14 @@ func initCLI() { waitCh := make(chan struct{}) stopCh := make(chan struct{}) + p := &prog{ + waitCh: waitCh, + stopCh: stopCh, + } if !daemon { // We need to call s.Run() as soon as possible to response to the OS manager, so it // can see ctrld is running and don't mark ctrld as failed service. go func() { - p := &prog{ - waitCh: waitCh, - stopCh: stopCh, - } s, err := newService(p, svcConfig) if err != nil { mainLog.Fatal().Err(err).Msg("failed create new service") @@ -163,14 +163,11 @@ func initCLI() { // so it's able to log information in processCDFlags. initLogging() - if setupRouter { - s, errCh := runDNSServerForNTPD(router.ListenAddress()) - if err := router.PreRun(); err != nil { - mainLog.Fatal().Err(err).Msg("failed to perform router pre-start check") - } - if err := s.Shutdown(); err != nil && errCh != nil { - mainLog.Fatal().Err(err).Msg("failed to shutdown dns server for ntpd") - } + // Processing --cd flag require connecting to ControlD API, which needs valid + // time for validating server certificate. Some routers need NTP synchronization + // to set the current time, so this check must happen before processCDFlags. + if err := router.PreRun(); err != nil { + mainLog.Fatal().Err(err).Msg("failed to perform router pre-run check") } processCDFlags() @@ -207,20 +204,34 @@ func initCLI() { rootCertPool = certs.CACertPool() fallthrough case platform != "": - mainLog.Debug().Msg("Router setup") - err := router.Configure(&cfg) - if errors.Is(err, router.ErrNotSupported) { + if !router.IsSupported(platform) { unsupportedPlatformHelp(cmd) os.Exit(1) } - if err != nil { - mainLog.Fatal().Err(err).Msg("failed to configure router") - } + p.onStarted = append(p.onStarted, func() { + mainLog.Debug().Msg("Router setup") + if err := router.Configure(&cfg); err != nil { + mainLog.Error().Err(err).Msg("could not configure router") + } + }) + p.onStopped = append(p.onStopped, func() { + mainLog.Debug().Msg("Router cleanup") + if err := router.Cleanup(svcConfig); err != nil { + mainLog.Error().Err(err).Msg("could not cleanup router") + } + if err := router.Stop(); err != nil { + mainLog.Error().Err(err).Msg("problem occurred while stopping router") + } + p.resetDNS() + }) } } close(waitCh) <-stopCh + for _, f := range p.onStopped { + f() + } }, } runCmd.Flags().BoolVarP(&daemon, "daemon", "d", false, "Run as daemon") @@ -303,12 +314,16 @@ func initCLI() { sc.Arguments = append(sc.Arguments, "--config="+defaultConfigFile) } - prog := &prog{} - s, err := newService(prog, sc) + p := &prog{} + s, err := newService(p, sc) if err != nil { mainLog.Error().Msg(err.Error()) return } + + mainLog.Debug().Msg("cleaning up router before installing") + _ = router.Cleanup(svcConfig) + tasks := []task{ {s.Stop, false}, {s.Uninstall, false}, @@ -333,12 +348,10 @@ func initCLI() { mainLog.Notice().Msg("Service started") default: mainLog.Error().Msg("Service did not start, please check system/service log for details error") - if runtime.GOOS == "linux" { - prog.resetDNS() - } + uninstall(p, s) os.Exit(1) } - prog.setDNS() + p.setDNS() } }, } @@ -454,29 +467,16 @@ func initCLI() { NOTE: Uninstalling will set DNS to values provided by DHCP.`, Args: cobra.NoArgs, Run: func(cmd *cobra.Command, args []string) { - prog := &prog{} - s, err := newService(prog, svcConfig) + p := &prog{} + s, err := newService(p, svcConfig) if err != nil { mainLog.Error().Msg(err.Error()) return } - tasks := []task{ - {s.Stop, false}, - {s.Uninstall, true}, - } - initLogging() - if doTasks(tasks) { - if iface == "" { - iface = "auto" - } - prog.resetDNS() - mainLog.Debug().Msg("Router cleanup") - if err := router.Cleanup(svcConfig); err != nil { - mainLog.Warn().Err(err).Msg("could not cleanup router") - } - mainLog.Notice().Msg("Service uninstalled") - return + if iface == "" { + iface = "auto" } + uninstall(p, s) }, } uninstallCmd.Flags().StringVarP(&iface, "iface", "", "", `Reset DNS setting for iface, use "auto" for the default gateway interface`) @@ -951,3 +951,23 @@ func tryReadingConfig(writeDefaultConfig bool) { } } } + +func uninstall(p *prog, s service.Service) { + tasks := []task{ + {s.Stop, false}, + {s.Uninstall, true}, + } + initLogging() + if doTasks(tasks) { + // Stop already reset DNS on router. + if router.Name() == "" { + p.resetDNS() + } + mainLog.Debug().Msg("Router cleanup") + // Stop already did router.Cleanup and report any error if happens, + // ignoring error here to prevent false positive. + _ = router.Cleanup(svcConfig) + mainLog.Notice().Msg("Service uninstalled") + return + } +} diff --git a/cmd/ctrld/dns_proxy.go b/cmd/ctrld/dns_proxy.go index 776224f..626c329 100644 --- a/cmd/ctrld/dns_proxy.go +++ b/cmd/ctrld/dns_proxy.go @@ -5,7 +5,9 @@ import ( "crypto/rand" "encoding/hex" "fmt" + "io" "net" + "os" "runtime" "strconv" "strings" @@ -13,7 +15,9 @@ import ( "time" "github.com/miekg/dns" + "go4.org/mem" "golang.org/x/sync/errgroup" + "tailscale.com/util/lineread" "github.com/Control-D-Inc/ctrld" "github.com/Control-D-Inc/ctrld/internal/dnscache" @@ -101,6 +105,12 @@ func (p *prog) serveDNS(listenerNum string) error { } } select { + case err := <-errCh: + return err + case <-time.After(5 * time.Second): + p.started <- struct{}{} + } + select { case <-ctx.Done(): return nil case err := <-errCh: @@ -463,50 +473,37 @@ func runDNSServer(addr, network string, handler dns.Handler) (*dns.Server, <-cha return s, errCh } -// runDNSServerForNTPD starts a DNS server listening on router.ListenAddress(). It must only be called when ctrld -// running on router, before router.PreRun() to serve DNS request for NTP synchronization. The caller must call -// s.Shutdown() explicitly when NTP is synced successfully. -func runDNSServerForNTPD(addr string) (*dns.Server, <-chan error) { - if addr == "" { - return &dns.Server{}, nil - } - dnsResolver := ctrld.NewBootstrapResolver() - s := &dns.Server{ - Addr: addr, - Net: "udp", - Handler: dns.HandlerFunc(func(w dns.ResponseWriter, m *dns.Msg) { - mainLog.Debug().Msg("Serving query for ntpd") - resolveCtx, cancel := context.WithCancel(context.Background()) - defer cancel() - if osUpstreamConfig.Timeout > 0 { - timeoutCtx, cancel := context.WithTimeout(resolveCtx, time.Millisecond*time.Duration(osUpstreamConfig.Timeout)) - defer cancel() - resolveCtx = timeoutCtx - } - answer, err := dnsResolver.Resolve(resolveCtx, m) - if err != nil { - mainLog.Error().Err(err).Msgf("could not resolve: %v", m) - return - } - if err := w.WriteMsg(answer); err != nil { - mainLog.Error().Err(err).Msg("runDNSServerForNTPD: failed to send DNS response") - } - }), +// inContainer reports whether we're running in a container. +// +// Copied from https://github.com/tailscale/tailscale/blob/v1.42.0/hostinfo/hostinfo.go#L260 +// with modification for ctrld usage. +func inContainer() bool { + if runtime.GOOS != "linux" { + return false } - waitLock := sync.Mutex{} - waitLock.Lock() - s.NotifyStartedFunc = waitLock.Unlock - - errCh := make(chan error) - go func() { - defer close(errCh) - if err := s.ListenAndServe(); err != nil { - waitLock.Unlock() - mainLog.Error().Err(err).Msgf("could not listen and serve on: %s", s.Addr) - errCh <- err + var ret bool + if _, err := os.Stat("/.dockerenv"); err == nil { + return true + } + if _, err := os.Stat("/run/.containerenv"); err == nil { + // See https://github.com/cri-o/cri-o/issues/5461 + return true + } + lineread.File("/proc/1/cgroup", func(line []byte) error { + if mem.Contains(mem.B(line), mem.S("/docker/")) || + mem.Contains(mem.B(line), mem.S("/lxc/")) { + ret = true + return io.EOF // arbitrary non-nil error to stop loop } - }() - waitLock.Lock() - return s, errCh + return nil + }) + lineread.File("/proc/mounts", func(line []byte) error { + if mem.Contains(mem.B(line), mem.S("fuse.lxcfs")) { + ret = true + return io.EOF + } + return nil + }) + return ret } diff --git a/cmd/ctrld/prog.go b/cmd/ctrld/prog.go index 2f17f82..5e563bf 100644 --- a/cmd/ctrld/prog.go +++ b/cmd/ctrld/prog.go @@ -39,6 +39,10 @@ type prog struct { cfg *ctrld.Config cache dnscache.Cacher sema semaphore + + started chan struct{} + onStarted []func() + onStopped []func() } func (p *prog) Start(s service.Service) error { @@ -51,6 +55,8 @@ func (p *prog) run() { // Wait the caller to signal that we can do our logic. <-p.waitCh p.preRun() + numListeners := len(p.cfg.Listener) + p.started = make(chan struct{}, numListeners) if p.cfg.Service.CacheEnable { cacher, err := dnscache.NewLRUCache(p.cfg.Service.CacheSize) if err != nil { @@ -143,20 +149,22 @@ func (p *prog) run() { }(listenerNum) } + for i := 0; i < numListeners; i++ { + <-p.started + } + for _, f := range p.onStarted { + f() + } wg.Wait() } func (p *prog) Stop(s service.Service) error { + close(p.stopCh) if err := p.deAllocateIP(); err != nil { mainLog.Error().Err(err).Msg("de-allocate ip failed") return err } - p.preStop() - if err := router.Stop(); err != nil { - mainLog.Warn().Err(err).Msg("problem occurred while stopping router") - } mainLog.Info().Msg("Service stopped") - close(p.stopCh) return nil } diff --git a/cmd/ctrld/prog_darwin.go b/cmd/ctrld/prog_darwin.go index 2b82eb5..4d9ad0a 100644 --- a/cmd/ctrld/prog_darwin.go +++ b/cmd/ctrld/prog_darwin.go @@ -8,6 +8,11 @@ func (p *prog) preRun() { if !service.Interactive() { p.setDNS() } + p.onStopped = append(p.onStopped, func() { + if !service.Interactive() { + p.resetDNS() + } + }) } func setDependencies(svc *service.Config) {} @@ -15,9 +20,3 @@ func setDependencies(svc *service.Config) {} func setWorkingDirectory(svc *service.Config, dir string) { svc.WorkingDirectory = dir } - -func (p *prog) preStop() { - if !service.Interactive() { - p.resetDNS() - } -} diff --git a/cmd/ctrld/prog_freebsd.go b/cmd/ctrld/prog_freebsd.go index 24a90ba..63d8179 100644 --- a/cmd/ctrld/prog_freebsd.go +++ b/cmd/ctrld/prog_freebsd.go @@ -18,5 +18,3 @@ func setDependencies(svc *service.Config) { } func setWorkingDirectory(svc *service.Config, dir string) {} - -func (p *prog) preStop() {} diff --git a/cmd/ctrld/prog_linux.go b/cmd/ctrld/prog_linux.go index 0b49a33..36bc316 100644 --- a/cmd/ctrld/prog_linux.go +++ b/cmd/ctrld/prog_linux.go @@ -31,5 +31,3 @@ func setDependencies(svc *service.Config) { func setWorkingDirectory(svc *service.Config, dir string) { svc.WorkingDirectory = dir } - -func (p *prog) preStop() {} diff --git a/cmd/ctrld/prog_others.go b/cmd/ctrld/prog_others.go index b26c0b6..50fcf0d 100644 --- a/cmd/ctrld/prog_others.go +++ b/cmd/ctrld/prog_others.go @@ -12,5 +12,3 @@ func setWorkingDirectory(svc *service.Config, dir string) { // WorkingDirectory is not supported on Windows. svc.WorkingDirectory = dir } - -func (p *prog) preStop() {} diff --git a/internal/router/router.go b/internal/router/router.go index f9b8be8..4916db4 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -118,7 +118,7 @@ func PreRun() (err error) { switch Name() { case Merlin, Tomato: // Wait until `ntp_ready=1` set. - b := backoff.NewBackoff("PreStart", func(format string, args ...any) {}, 10*time.Second) + b := backoff.NewBackoff("PreRun", func(format string, args ...any) {}, 10*time.Second) for { out, err := nvram("get", "ntp_ready") if err != nil {