cmd/cli: make self-check process faster

The "ctrld start" command is running slow, and using much CPU than
necessary. The problem was made because of several things:

1. ctrld process is waiting for 5 seconds before marking listeners up.
   That ends up adding those seconds to the self-check process, even
   though the listeners may have been already available.

2. While creating socket control client, "s.Status()" is called to
   obtain ctrld service status, so we could terminate early if the
   service failed to run. However, that would make a lot of syscall in a
   hot loop, eating the CPU constantly while the command is running. On
   Windows, that call would become slower after each calls. The same
   effect could be seen using Windows services manager GUI, by pressing
   start/stop/restart button fast enough, we could see a timeout raised.

3. The socket control server is started lately, after all the listeners
   up. That would make the loop for creating socket control client run
   longer and use much resources than necessary.

Fixes for these problems are quite obvious:

1. Removing hard code 5 seconds waiting. NotifyStartedFunc is enough to
   ensure that listeners are ready for accepting requests.

2. Check "s.Status()" only once before the loop. There has been already
   30 seconds timeout, so if anything went wrong, the self-check process
   could be terminated, and won't hang forever.

3. Starting socket control server earlier, so newSocketControlClient can
   connect to server with fewest attempts, then querying "/started"
   endpoint to ensure the listeners have been ready.

With these fixes, "ctrld start" now run much faster on modern machines,
taking ~1-2 seconds (previously ~5-8 seconds) to finish. On dual cores
VM, it takes ~5-8 seconds (previously a few dozen seconds or timeout).

---

While at it, there are two refactoring for making the code easier to
read/maintain:

- PersistentPreRun is now used in root command to init console logging,
  so we don't have to initialize them in sub-commands.

- NotifyStartedFunc now use channel for synchronization, instead of a
  mutex, making the ugly asymetric calls to lock goes away, making the
  code more idiom, and theoretically have better performance.
This commit is contained in:
Cuong Manh Le
2024-05-06 14:27:56 +07:00
committed by Cuong Manh Le
parent f499770d45
commit a1fda2c0de
3 changed files with 19 additions and 35 deletions

View File

@@ -1598,7 +1598,7 @@ func selfCheckStatus(s service.Service) (bool, service.Status, error) {
dir, err := socketDir()
if err != nil {
mainLog.Load().Error().Err(err).Msg("failed to check ctrld listener status: could not get home directory")
mainLog.Load().Error().Err(err).Msg("failed to get ctrld listener status: could not get home directory")
return false, status, err
}
mainLog.Load().Debug().Msg("waiting for ctrld listener to be ready")
@@ -1607,17 +1607,6 @@ func selfCheckStatus(s service.Service) (bool, service.Status, error) {
return false, status, errors.New("could not connect to control server")
}
resp, err := cc.post(startedPath, nil)
if err != nil {
mainLog.Load().Error().Err(err).Msg("failed to connect to control server")
return false, status, err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
mainLog.Load().Error().Msg("ctrld listener is not ready")
return false, status, errors.New("ctrld listener is not ready")
}
// Not a ctrld upstream, return status as-is.
if cfg.FirstUpstream().VerifyDomain() == "" {
return true, status, nil
@@ -2261,6 +2250,10 @@ func removeProvTokenFromArgs(sc *service.Config) {
// newSocketControlClient returns new control client after control server was started.
func newSocketControlClient(s service.Service, dir string) *controlClient {
// Return early if service is not running.
if status, err := s.Status(); err != nil || status != service.StatusRunning {
return nil
}
bo := backoff.NewBackoff("self-check", logf, 10*time.Second)
bo.LogLongerThan = 10 * time.Second
ctx := context.Background()
@@ -2270,28 +2263,21 @@ func newSocketControlClient(s service.Service, dir string) *controlClient {
defer timeout.Stop()
// The socket control server may not start yet, so attempt to ping
// it until we got a response. For each iteration, check ctrld status
// to make sure ctrld is still running.
// it until we got a response.
for {
curStatus, err := s.Status()
if err != nil {
return nil
}
if curStatus != service.StatusRunning {
return nil
}
if _, err := cc.post("/", nil); err == nil {
_, err := cc.post(startedPath, nil)
if err == nil {
// Server was started, stop pinging.
break
}
// The socket control server is not ready yet, backoff for waiting it to be ready.
bo.BackOff(ctx, err)
select {
case <-timeout.C:
return nil
default:
}
continue
}
return cc

View File

@@ -210,12 +210,9 @@ func (p *prog) serveDNS(listenerNum string) error {
addr := net.JoinHostPort(listenerConfig.IP, strconv.Itoa(listenerConfig.Port))
s, errCh := runDNSServer(addr, proto, handler)
defer s.Shutdown()
select {
case err := <-errCh:
return err
case <-time.After(5 * time.Second):
p.started <- struct{}{}
}
p.started <- struct{}{}
select {
case <-p.stopCh:
case <-ctx.Done():

View File

@@ -249,6 +249,13 @@ func (p *prog) run(reload bool, reloadCh chan struct{}) {
numListeners := len(p.cfg.Listener)
if !reload {
p.started = make(chan struct{}, numListeners)
if p.cs != nil {
p.registerControlServerHandler()
if err := p.cs.start(); err != nil {
mainLog.Load().Warn().Err(err).Msg("could not start control server")
}
mainLog.Load().Debug().Msgf("control server started: %s", p.cs.addr)
}
}
p.onStartedDone = make(chan struct{})
p.loop = make(map[string]bool)
@@ -381,12 +388,6 @@ func (p *prog) run(reload bool, reloadCh chan struct{}) {
if p.logConn != nil {
_ = p.logConn.Close()
}
if p.cs != nil {
p.registerControlServerHandler()
if err := p.cs.start(); err != nil {
mainLog.Load().Warn().Err(err).Msg("could not start control server")
}
}
}
wg.Wait()
}