cmd/ctrld: improving ctrld stability on router

The current state of ctrld is very "high stakes" and easy to mess up,
and is unforgiving when "ctrld start" failed. That would cause the
router is in broken state, unrecoverable.

This commit makes these changes to improve the state:

 - Moving router setup process after ctrld listeners are ready, so
   dnsmasq won't flood requests to ctrld even though the listeners are
   not ready to serve requests.

 - On router, when ctrld stopped, restore router DNS setup. That leaves
   the router in good state on reboot/startup, help removing the custom
   DNS server for NTP synchronization on some routers.

 - If self-check failed, uninstall ctrld to restore router to good
   state, prevent confusion that ctrld process is still running even
   though self-check reports it did not started.
This commit is contained in:
Cuong Manh Le
2023-06-16 20:56:21 +07:00
committed by Cuong Manh Le
parent 32482809b7
commit 67e4afc06e
8 changed files with 121 additions and 103 deletions

View File

@@ -123,14 +123,14 @@ func initCLI() {
waitCh := make(chan struct{})
stopCh := make(chan struct{})
p := &prog{
waitCh: waitCh,
stopCh: stopCh,
}
if !daemon {
// We need to call s.Run() as soon as possible to response to the OS manager, so it
// can see ctrld is running and don't mark ctrld as failed service.
go func() {
p := &prog{
waitCh: waitCh,
stopCh: stopCh,
}
s, err := newService(p, svcConfig)
if err != nil {
mainLog.Fatal().Err(err).Msg("failed create new service")
@@ -163,14 +163,11 @@ func initCLI() {
// so it's able to log information in processCDFlags.
initLogging()
if setupRouter {
s, errCh := runDNSServerForNTPD(router.ListenAddress())
if err := router.PreRun(); err != nil {
mainLog.Fatal().Err(err).Msg("failed to perform router pre-start check")
}
if err := s.Shutdown(); err != nil && errCh != nil {
mainLog.Fatal().Err(err).Msg("failed to shutdown dns server for ntpd")
}
// Processing --cd flag require connecting to ControlD API, which needs valid
// time for validating server certificate. Some routers need NTP synchronization
// to set the current time, so this check must happen before processCDFlags.
if err := router.PreRun(); err != nil {
mainLog.Fatal().Err(err).Msg("failed to perform router pre-run check")
}
processCDFlags()
@@ -207,20 +204,34 @@ func initCLI() {
rootCertPool = certs.CACertPool()
fallthrough
case platform != "":
mainLog.Debug().Msg("Router setup")
err := router.Configure(&cfg)
if errors.Is(err, router.ErrNotSupported) {
if !router.IsSupported(platform) {
unsupportedPlatformHelp(cmd)
os.Exit(1)
}
if err != nil {
mainLog.Fatal().Err(err).Msg("failed to configure router")
}
p.onStarted = append(p.onStarted, func() {
mainLog.Debug().Msg("Router setup")
if err := router.Configure(&cfg); err != nil {
mainLog.Error().Err(err).Msg("could not configure router")
}
})
p.onStopped = append(p.onStopped, func() {
mainLog.Debug().Msg("Router cleanup")
if err := router.Cleanup(svcConfig); err != nil {
mainLog.Error().Err(err).Msg("could not cleanup router")
}
if err := router.Stop(); err != nil {
mainLog.Error().Err(err).Msg("problem occurred while stopping router")
}
p.resetDNS()
})
}
}
close(waitCh)
<-stopCh
for _, f := range p.onStopped {
f()
}
},
}
runCmd.Flags().BoolVarP(&daemon, "daemon", "d", false, "Run as daemon")
@@ -303,12 +314,16 @@ func initCLI() {
sc.Arguments = append(sc.Arguments, "--config="+defaultConfigFile)
}
prog := &prog{}
s, err := newService(prog, sc)
p := &prog{}
s, err := newService(p, sc)
if err != nil {
mainLog.Error().Msg(err.Error())
return
}
mainLog.Debug().Msg("cleaning up router before installing")
_ = router.Cleanup(svcConfig)
tasks := []task{
{s.Stop, false},
{s.Uninstall, false},
@@ -333,12 +348,10 @@ func initCLI() {
mainLog.Notice().Msg("Service started")
default:
mainLog.Error().Msg("Service did not start, please check system/service log for details error")
if runtime.GOOS == "linux" {
prog.resetDNS()
}
uninstall(p, s)
os.Exit(1)
}
prog.setDNS()
p.setDNS()
}
},
}
@@ -454,29 +467,16 @@ func initCLI() {
NOTE: Uninstalling will set DNS to values provided by DHCP.`,
Args: cobra.NoArgs,
Run: func(cmd *cobra.Command, args []string) {
prog := &prog{}
s, err := newService(prog, svcConfig)
p := &prog{}
s, err := newService(p, svcConfig)
if err != nil {
mainLog.Error().Msg(err.Error())
return
}
tasks := []task{
{s.Stop, false},
{s.Uninstall, true},
}
initLogging()
if doTasks(tasks) {
if iface == "" {
iface = "auto"
}
prog.resetDNS()
mainLog.Debug().Msg("Router cleanup")
if err := router.Cleanup(svcConfig); err != nil {
mainLog.Warn().Err(err).Msg("could not cleanup router")
}
mainLog.Notice().Msg("Service uninstalled")
return
if iface == "" {
iface = "auto"
}
uninstall(p, s)
},
}
uninstallCmd.Flags().StringVarP(&iface, "iface", "", "", `Reset DNS setting for iface, use "auto" for the default gateway interface`)
@@ -951,3 +951,23 @@ func tryReadingConfig(writeDefaultConfig bool) {
}
}
}
func uninstall(p *prog, s service.Service) {
tasks := []task{
{s.Stop, false},
{s.Uninstall, true},
}
initLogging()
if doTasks(tasks) {
// Stop already reset DNS on router.
if router.Name() == "" {
p.resetDNS()
}
mainLog.Debug().Msg("Router cleanup")
// Stop already did router.Cleanup and report any error if happens,
// ignoring error here to prevent false positive.
_ = router.Cleanup(svcConfig)
mainLog.Notice().Msg("Service uninstalled")
return
}
}

View File

@@ -5,7 +5,9 @@ import (
"crypto/rand"
"encoding/hex"
"fmt"
"io"
"net"
"os"
"runtime"
"strconv"
"strings"
@@ -13,7 +15,9 @@ import (
"time"
"github.com/miekg/dns"
"go4.org/mem"
"golang.org/x/sync/errgroup"
"tailscale.com/util/lineread"
"github.com/Control-D-Inc/ctrld"
"github.com/Control-D-Inc/ctrld/internal/dnscache"
@@ -101,6 +105,12 @@ func (p *prog) serveDNS(listenerNum string) error {
}
}
select {
case err := <-errCh:
return err
case <-time.After(5 * time.Second):
p.started <- struct{}{}
}
select {
case <-ctx.Done():
return nil
case err := <-errCh:
@@ -463,50 +473,37 @@ func runDNSServer(addr, network string, handler dns.Handler) (*dns.Server, <-cha
return s, errCh
}
// runDNSServerForNTPD starts a DNS server listening on router.ListenAddress(). It must only be called when ctrld
// running on router, before router.PreRun() to serve DNS request for NTP synchronization. The caller must call
// s.Shutdown() explicitly when NTP is synced successfully.
func runDNSServerForNTPD(addr string) (*dns.Server, <-chan error) {
if addr == "" {
return &dns.Server{}, nil
}
dnsResolver := ctrld.NewBootstrapResolver()
s := &dns.Server{
Addr: addr,
Net: "udp",
Handler: dns.HandlerFunc(func(w dns.ResponseWriter, m *dns.Msg) {
mainLog.Debug().Msg("Serving query for ntpd")
resolveCtx, cancel := context.WithCancel(context.Background())
defer cancel()
if osUpstreamConfig.Timeout > 0 {
timeoutCtx, cancel := context.WithTimeout(resolveCtx, time.Millisecond*time.Duration(osUpstreamConfig.Timeout))
defer cancel()
resolveCtx = timeoutCtx
}
answer, err := dnsResolver.Resolve(resolveCtx, m)
if err != nil {
mainLog.Error().Err(err).Msgf("could not resolve: %v", m)
return
}
if err := w.WriteMsg(answer); err != nil {
mainLog.Error().Err(err).Msg("runDNSServerForNTPD: failed to send DNS response")
}
}),
// inContainer reports whether we're running in a container.
//
// Copied from https://github.com/tailscale/tailscale/blob/v1.42.0/hostinfo/hostinfo.go#L260
// with modification for ctrld usage.
func inContainer() bool {
if runtime.GOOS != "linux" {
return false
}
waitLock := sync.Mutex{}
waitLock.Lock()
s.NotifyStartedFunc = waitLock.Unlock
errCh := make(chan error)
go func() {
defer close(errCh)
if err := s.ListenAndServe(); err != nil {
waitLock.Unlock()
mainLog.Error().Err(err).Msgf("could not listen and serve on: %s", s.Addr)
errCh <- err
var ret bool
if _, err := os.Stat("/.dockerenv"); err == nil {
return true
}
if _, err := os.Stat("/run/.containerenv"); err == nil {
// See https://github.com/cri-o/cri-o/issues/5461
return true
}
lineread.File("/proc/1/cgroup", func(line []byte) error {
if mem.Contains(mem.B(line), mem.S("/docker/")) ||
mem.Contains(mem.B(line), mem.S("/lxc/")) {
ret = true
return io.EOF // arbitrary non-nil error to stop loop
}
}()
waitLock.Lock()
return s, errCh
return nil
})
lineread.File("/proc/mounts", func(line []byte) error {
if mem.Contains(mem.B(line), mem.S("fuse.lxcfs")) {
ret = true
return io.EOF
}
return nil
})
return ret
}

View File

@@ -39,6 +39,10 @@ type prog struct {
cfg *ctrld.Config
cache dnscache.Cacher
sema semaphore
started chan struct{}
onStarted []func()
onStopped []func()
}
func (p *prog) Start(s service.Service) error {
@@ -51,6 +55,8 @@ func (p *prog) run() {
// Wait the caller to signal that we can do our logic.
<-p.waitCh
p.preRun()
numListeners := len(p.cfg.Listener)
p.started = make(chan struct{}, numListeners)
if p.cfg.Service.CacheEnable {
cacher, err := dnscache.NewLRUCache(p.cfg.Service.CacheSize)
if err != nil {
@@ -143,20 +149,22 @@ func (p *prog) run() {
}(listenerNum)
}
for i := 0; i < numListeners; i++ {
<-p.started
}
for _, f := range p.onStarted {
f()
}
wg.Wait()
}
func (p *prog) Stop(s service.Service) error {
close(p.stopCh)
if err := p.deAllocateIP(); err != nil {
mainLog.Error().Err(err).Msg("de-allocate ip failed")
return err
}
p.preStop()
if err := router.Stop(); err != nil {
mainLog.Warn().Err(err).Msg("problem occurred while stopping router")
}
mainLog.Info().Msg("Service stopped")
close(p.stopCh)
return nil
}

View File

@@ -8,6 +8,11 @@ func (p *prog) preRun() {
if !service.Interactive() {
p.setDNS()
}
p.onStopped = append(p.onStopped, func() {
if !service.Interactive() {
p.resetDNS()
}
})
}
func setDependencies(svc *service.Config) {}
@@ -15,9 +20,3 @@ func setDependencies(svc *service.Config) {}
func setWorkingDirectory(svc *service.Config, dir string) {
svc.WorkingDirectory = dir
}
func (p *prog) preStop() {
if !service.Interactive() {
p.resetDNS()
}
}

View File

@@ -18,5 +18,3 @@ func setDependencies(svc *service.Config) {
}
func setWorkingDirectory(svc *service.Config, dir string) {}
func (p *prog) preStop() {}

View File

@@ -31,5 +31,3 @@ func setDependencies(svc *service.Config) {
func setWorkingDirectory(svc *service.Config, dir string) {
svc.WorkingDirectory = dir
}
func (p *prog) preStop() {}

View File

@@ -12,5 +12,3 @@ func setWorkingDirectory(svc *service.Config, dir string) {
// WorkingDirectory is not supported on Windows.
svc.WorkingDirectory = dir
}
func (p *prog) preStop() {}

View File

@@ -118,7 +118,7 @@ func PreRun() (err error) {
switch Name() {
case Merlin, Tomato:
// Wait until `ntp_ready=1` set.
b := backoff.NewBackoff("PreStart", func(format string, args ...any) {}, 10*time.Second)
b := backoff.NewBackoff("PreRun", func(format string, args ...any) {}, 10*time.Second)
for {
out, err := nvram("get", "ntp_ready")
if err != nil {