mirror of
https://github.com/Control-D-Inc/ctrld.git
synced 2026-02-03 22:18:39 +00:00
cmd/ctrld: improving ctrld stability on router
The current state of ctrld is very "high stakes" and easy to mess up, and is unforgiving when "ctrld start" failed. That would cause the router is in broken state, unrecoverable. This commit makes these changes to improve the state: - Moving router setup process after ctrld listeners are ready, so dnsmasq won't flood requests to ctrld even though the listeners are not ready to serve requests. - On router, when ctrld stopped, restore router DNS setup. That leaves the router in good state on reboot/startup, help removing the custom DNS server for NTP synchronization on some routers. - If self-check failed, uninstall ctrld to restore router to good state, prevent confusion that ctrld process is still running even though self-check reports it did not started.
This commit is contained in:
committed by
Cuong Manh Le
parent
32482809b7
commit
67e4afc06e
104
cmd/ctrld/cli.go
104
cmd/ctrld/cli.go
@@ -123,14 +123,14 @@ func initCLI() {
|
||||
|
||||
waitCh := make(chan struct{})
|
||||
stopCh := make(chan struct{})
|
||||
p := &prog{
|
||||
waitCh: waitCh,
|
||||
stopCh: stopCh,
|
||||
}
|
||||
if !daemon {
|
||||
// We need to call s.Run() as soon as possible to response to the OS manager, so it
|
||||
// can see ctrld is running and don't mark ctrld as failed service.
|
||||
go func() {
|
||||
p := &prog{
|
||||
waitCh: waitCh,
|
||||
stopCh: stopCh,
|
||||
}
|
||||
s, err := newService(p, svcConfig)
|
||||
if err != nil {
|
||||
mainLog.Fatal().Err(err).Msg("failed create new service")
|
||||
@@ -163,14 +163,11 @@ func initCLI() {
|
||||
// so it's able to log information in processCDFlags.
|
||||
initLogging()
|
||||
|
||||
if setupRouter {
|
||||
s, errCh := runDNSServerForNTPD(router.ListenAddress())
|
||||
if err := router.PreRun(); err != nil {
|
||||
mainLog.Fatal().Err(err).Msg("failed to perform router pre-start check")
|
||||
}
|
||||
if err := s.Shutdown(); err != nil && errCh != nil {
|
||||
mainLog.Fatal().Err(err).Msg("failed to shutdown dns server for ntpd")
|
||||
}
|
||||
// Processing --cd flag require connecting to ControlD API, which needs valid
|
||||
// time for validating server certificate. Some routers need NTP synchronization
|
||||
// to set the current time, so this check must happen before processCDFlags.
|
||||
if err := router.PreRun(); err != nil {
|
||||
mainLog.Fatal().Err(err).Msg("failed to perform router pre-run check")
|
||||
}
|
||||
|
||||
processCDFlags()
|
||||
@@ -207,20 +204,34 @@ func initCLI() {
|
||||
rootCertPool = certs.CACertPool()
|
||||
fallthrough
|
||||
case platform != "":
|
||||
mainLog.Debug().Msg("Router setup")
|
||||
err := router.Configure(&cfg)
|
||||
if errors.Is(err, router.ErrNotSupported) {
|
||||
if !router.IsSupported(platform) {
|
||||
unsupportedPlatformHelp(cmd)
|
||||
os.Exit(1)
|
||||
}
|
||||
if err != nil {
|
||||
mainLog.Fatal().Err(err).Msg("failed to configure router")
|
||||
}
|
||||
p.onStarted = append(p.onStarted, func() {
|
||||
mainLog.Debug().Msg("Router setup")
|
||||
if err := router.Configure(&cfg); err != nil {
|
||||
mainLog.Error().Err(err).Msg("could not configure router")
|
||||
}
|
||||
})
|
||||
p.onStopped = append(p.onStopped, func() {
|
||||
mainLog.Debug().Msg("Router cleanup")
|
||||
if err := router.Cleanup(svcConfig); err != nil {
|
||||
mainLog.Error().Err(err).Msg("could not cleanup router")
|
||||
}
|
||||
if err := router.Stop(); err != nil {
|
||||
mainLog.Error().Err(err).Msg("problem occurred while stopping router")
|
||||
}
|
||||
p.resetDNS()
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
close(waitCh)
|
||||
<-stopCh
|
||||
for _, f := range p.onStopped {
|
||||
f()
|
||||
}
|
||||
},
|
||||
}
|
||||
runCmd.Flags().BoolVarP(&daemon, "daemon", "d", false, "Run as daemon")
|
||||
@@ -303,12 +314,16 @@ func initCLI() {
|
||||
sc.Arguments = append(sc.Arguments, "--config="+defaultConfigFile)
|
||||
}
|
||||
|
||||
prog := &prog{}
|
||||
s, err := newService(prog, sc)
|
||||
p := &prog{}
|
||||
s, err := newService(p, sc)
|
||||
if err != nil {
|
||||
mainLog.Error().Msg(err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
mainLog.Debug().Msg("cleaning up router before installing")
|
||||
_ = router.Cleanup(svcConfig)
|
||||
|
||||
tasks := []task{
|
||||
{s.Stop, false},
|
||||
{s.Uninstall, false},
|
||||
@@ -333,12 +348,10 @@ func initCLI() {
|
||||
mainLog.Notice().Msg("Service started")
|
||||
default:
|
||||
mainLog.Error().Msg("Service did not start, please check system/service log for details error")
|
||||
if runtime.GOOS == "linux" {
|
||||
prog.resetDNS()
|
||||
}
|
||||
uninstall(p, s)
|
||||
os.Exit(1)
|
||||
}
|
||||
prog.setDNS()
|
||||
p.setDNS()
|
||||
}
|
||||
},
|
||||
}
|
||||
@@ -454,29 +467,16 @@ func initCLI() {
|
||||
NOTE: Uninstalling will set DNS to values provided by DHCP.`,
|
||||
Args: cobra.NoArgs,
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
prog := &prog{}
|
||||
s, err := newService(prog, svcConfig)
|
||||
p := &prog{}
|
||||
s, err := newService(p, svcConfig)
|
||||
if err != nil {
|
||||
mainLog.Error().Msg(err.Error())
|
||||
return
|
||||
}
|
||||
tasks := []task{
|
||||
{s.Stop, false},
|
||||
{s.Uninstall, true},
|
||||
}
|
||||
initLogging()
|
||||
if doTasks(tasks) {
|
||||
if iface == "" {
|
||||
iface = "auto"
|
||||
}
|
||||
prog.resetDNS()
|
||||
mainLog.Debug().Msg("Router cleanup")
|
||||
if err := router.Cleanup(svcConfig); err != nil {
|
||||
mainLog.Warn().Err(err).Msg("could not cleanup router")
|
||||
}
|
||||
mainLog.Notice().Msg("Service uninstalled")
|
||||
return
|
||||
if iface == "" {
|
||||
iface = "auto"
|
||||
}
|
||||
uninstall(p, s)
|
||||
},
|
||||
}
|
||||
uninstallCmd.Flags().StringVarP(&iface, "iface", "", "", `Reset DNS setting for iface, use "auto" for the default gateway interface`)
|
||||
@@ -951,3 +951,23 @@ func tryReadingConfig(writeDefaultConfig bool) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func uninstall(p *prog, s service.Service) {
|
||||
tasks := []task{
|
||||
{s.Stop, false},
|
||||
{s.Uninstall, true},
|
||||
}
|
||||
initLogging()
|
||||
if doTasks(tasks) {
|
||||
// Stop already reset DNS on router.
|
||||
if router.Name() == "" {
|
||||
p.resetDNS()
|
||||
}
|
||||
mainLog.Debug().Msg("Router cleanup")
|
||||
// Stop already did router.Cleanup and report any error if happens,
|
||||
// ignoring error here to prevent false positive.
|
||||
_ = router.Cleanup(svcConfig)
|
||||
mainLog.Notice().Msg("Service uninstalled")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,9 @@ import (
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -13,7 +15,9 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/miekg/dns"
|
||||
"go4.org/mem"
|
||||
"golang.org/x/sync/errgroup"
|
||||
"tailscale.com/util/lineread"
|
||||
|
||||
"github.com/Control-D-Inc/ctrld"
|
||||
"github.com/Control-D-Inc/ctrld/internal/dnscache"
|
||||
@@ -101,6 +105,12 @@ func (p *prog) serveDNS(listenerNum string) error {
|
||||
}
|
||||
}
|
||||
select {
|
||||
case err := <-errCh:
|
||||
return err
|
||||
case <-time.After(5 * time.Second):
|
||||
p.started <- struct{}{}
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil
|
||||
case err := <-errCh:
|
||||
@@ -463,50 +473,37 @@ func runDNSServer(addr, network string, handler dns.Handler) (*dns.Server, <-cha
|
||||
return s, errCh
|
||||
}
|
||||
|
||||
// runDNSServerForNTPD starts a DNS server listening on router.ListenAddress(). It must only be called when ctrld
|
||||
// running on router, before router.PreRun() to serve DNS request for NTP synchronization. The caller must call
|
||||
// s.Shutdown() explicitly when NTP is synced successfully.
|
||||
func runDNSServerForNTPD(addr string) (*dns.Server, <-chan error) {
|
||||
if addr == "" {
|
||||
return &dns.Server{}, nil
|
||||
}
|
||||
dnsResolver := ctrld.NewBootstrapResolver()
|
||||
s := &dns.Server{
|
||||
Addr: addr,
|
||||
Net: "udp",
|
||||
Handler: dns.HandlerFunc(func(w dns.ResponseWriter, m *dns.Msg) {
|
||||
mainLog.Debug().Msg("Serving query for ntpd")
|
||||
resolveCtx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
if osUpstreamConfig.Timeout > 0 {
|
||||
timeoutCtx, cancel := context.WithTimeout(resolveCtx, time.Millisecond*time.Duration(osUpstreamConfig.Timeout))
|
||||
defer cancel()
|
||||
resolveCtx = timeoutCtx
|
||||
}
|
||||
answer, err := dnsResolver.Resolve(resolveCtx, m)
|
||||
if err != nil {
|
||||
mainLog.Error().Err(err).Msgf("could not resolve: %v", m)
|
||||
return
|
||||
}
|
||||
if err := w.WriteMsg(answer); err != nil {
|
||||
mainLog.Error().Err(err).Msg("runDNSServerForNTPD: failed to send DNS response")
|
||||
}
|
||||
}),
|
||||
// inContainer reports whether we're running in a container.
|
||||
//
|
||||
// Copied from https://github.com/tailscale/tailscale/blob/v1.42.0/hostinfo/hostinfo.go#L260
|
||||
// with modification for ctrld usage.
|
||||
func inContainer() bool {
|
||||
if runtime.GOOS != "linux" {
|
||||
return false
|
||||
}
|
||||
|
||||
waitLock := sync.Mutex{}
|
||||
waitLock.Lock()
|
||||
s.NotifyStartedFunc = waitLock.Unlock
|
||||
|
||||
errCh := make(chan error)
|
||||
go func() {
|
||||
defer close(errCh)
|
||||
if err := s.ListenAndServe(); err != nil {
|
||||
waitLock.Unlock()
|
||||
mainLog.Error().Err(err).Msgf("could not listen and serve on: %s", s.Addr)
|
||||
errCh <- err
|
||||
var ret bool
|
||||
if _, err := os.Stat("/.dockerenv"); err == nil {
|
||||
return true
|
||||
}
|
||||
if _, err := os.Stat("/run/.containerenv"); err == nil {
|
||||
// See https://github.com/cri-o/cri-o/issues/5461
|
||||
return true
|
||||
}
|
||||
lineread.File("/proc/1/cgroup", func(line []byte) error {
|
||||
if mem.Contains(mem.B(line), mem.S("/docker/")) ||
|
||||
mem.Contains(mem.B(line), mem.S("/lxc/")) {
|
||||
ret = true
|
||||
return io.EOF // arbitrary non-nil error to stop loop
|
||||
}
|
||||
}()
|
||||
waitLock.Lock()
|
||||
return s, errCh
|
||||
return nil
|
||||
})
|
||||
lineread.File("/proc/mounts", func(line []byte) error {
|
||||
if mem.Contains(mem.B(line), mem.S("fuse.lxcfs")) {
|
||||
ret = true
|
||||
return io.EOF
|
||||
}
|
||||
return nil
|
||||
})
|
||||
return ret
|
||||
}
|
||||
|
||||
@@ -39,6 +39,10 @@ type prog struct {
|
||||
cfg *ctrld.Config
|
||||
cache dnscache.Cacher
|
||||
sema semaphore
|
||||
|
||||
started chan struct{}
|
||||
onStarted []func()
|
||||
onStopped []func()
|
||||
}
|
||||
|
||||
func (p *prog) Start(s service.Service) error {
|
||||
@@ -51,6 +55,8 @@ func (p *prog) run() {
|
||||
// Wait the caller to signal that we can do our logic.
|
||||
<-p.waitCh
|
||||
p.preRun()
|
||||
numListeners := len(p.cfg.Listener)
|
||||
p.started = make(chan struct{}, numListeners)
|
||||
if p.cfg.Service.CacheEnable {
|
||||
cacher, err := dnscache.NewLRUCache(p.cfg.Service.CacheSize)
|
||||
if err != nil {
|
||||
@@ -143,20 +149,22 @@ func (p *prog) run() {
|
||||
}(listenerNum)
|
||||
}
|
||||
|
||||
for i := 0; i < numListeners; i++ {
|
||||
<-p.started
|
||||
}
|
||||
for _, f := range p.onStarted {
|
||||
f()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func (p *prog) Stop(s service.Service) error {
|
||||
close(p.stopCh)
|
||||
if err := p.deAllocateIP(); err != nil {
|
||||
mainLog.Error().Err(err).Msg("de-allocate ip failed")
|
||||
return err
|
||||
}
|
||||
p.preStop()
|
||||
if err := router.Stop(); err != nil {
|
||||
mainLog.Warn().Err(err).Msg("problem occurred while stopping router")
|
||||
}
|
||||
mainLog.Info().Msg("Service stopped")
|
||||
close(p.stopCh)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,11 @@ func (p *prog) preRun() {
|
||||
if !service.Interactive() {
|
||||
p.setDNS()
|
||||
}
|
||||
p.onStopped = append(p.onStopped, func() {
|
||||
if !service.Interactive() {
|
||||
p.resetDNS()
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func setDependencies(svc *service.Config) {}
|
||||
@@ -15,9 +20,3 @@ func setDependencies(svc *service.Config) {}
|
||||
func setWorkingDirectory(svc *service.Config, dir string) {
|
||||
svc.WorkingDirectory = dir
|
||||
}
|
||||
|
||||
func (p *prog) preStop() {
|
||||
if !service.Interactive() {
|
||||
p.resetDNS()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,5 +18,3 @@ func setDependencies(svc *service.Config) {
|
||||
}
|
||||
|
||||
func setWorkingDirectory(svc *service.Config, dir string) {}
|
||||
|
||||
func (p *prog) preStop() {}
|
||||
|
||||
@@ -31,5 +31,3 @@ func setDependencies(svc *service.Config) {
|
||||
func setWorkingDirectory(svc *service.Config, dir string) {
|
||||
svc.WorkingDirectory = dir
|
||||
}
|
||||
|
||||
func (p *prog) preStop() {}
|
||||
|
||||
@@ -12,5 +12,3 @@ func setWorkingDirectory(svc *service.Config, dir string) {
|
||||
// WorkingDirectory is not supported on Windows.
|
||||
svc.WorkingDirectory = dir
|
||||
}
|
||||
|
||||
func (p *prog) preStop() {}
|
||||
|
||||
@@ -118,7 +118,7 @@ func PreRun() (err error) {
|
||||
switch Name() {
|
||||
case Merlin, Tomato:
|
||||
// Wait until `ntp_ready=1` set.
|
||||
b := backoff.NewBackoff("PreStart", func(format string, args ...any) {}, 10*time.Second)
|
||||
b := backoff.NewBackoff("PreRun", func(format string, args ...any) {}, 10*time.Second)
|
||||
for {
|
||||
out, err := nvram("get", "ntp_ready")
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user