Mirror of https://github.com/Control-D-Inc/ctrld.git, synced 2026-02-03 22:18:39 +00:00
This commit extends the documentation effort by adding detailed explanatory comments to key CLI components and core functionality throughout the cmd/ directory. The changes focus on explaining WHY certain logic is needed, not just WHAT the code does, improving code maintainability and helping developers understand complex business decisions.

Key improvements:
- Main entry points: Document CLI initialization, logging setup, and cache configuration with reasoning for design decisions
- DNS proxy core: Explain DNS proxy constants, data structures, and core processing pipeline for handling DNS queries
- Service management: Document service command structure, configuration patterns, and platform-specific service handling
- Logging infrastructure: Explain log buffer management, level encoders, and log formatting decisions for different use cases
- Metrics and monitoring: Document Prometheus metrics structure, HTTP endpoints, and conditional metric collection for performance
- Network handling: Explain Linux-specific network interface filtering, virtual interface detection, and DNS configuration management
- Hostname validation: Document RFC 1123 compliance and DNS naming standards for system compatibility (a standalone sketch of these rules follows this list)
- Mobile integration: Explain HTTP retry logic, fallback mechanisms, and mobile platform integration patterns
- Connection management: Document connection wrapper design to prevent log pollution during process lifecycle

Technical details:
- Added explanatory comments to 11 additional files in cmd/cli/
- Maintained consistent documentation style and format
- Preserved all existing functionality while improving code clarity
- Enhanced understanding of complex business logic and platform-specific behavior

These comments help future developers understand the reasoning behind complex decisions, making the codebase more maintainable and reducing the risk of incorrect modifications during maintenance.
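Of the areas listed above, RFC 1123 hostname validation is compact enough to sketch in isolation. The standalone snippet below is not taken from the repository and may differ from ctrld's actual validator; validRFC1123Hostname is a hypothetical name. It illustrates the usual RFC 1123 checks: at most 253 characters overall, dot-separated labels of 1 to 63 characters, only letters, digits, and hyphens, and no label starting or ending with a hyphen.

package main

import (
	"fmt"
	"strings"
)

// validRFC1123Hostname reports whether name satisfies common RFC 1123
// hostname rules. Illustrative sketch only; not the project's validator.
func validRFC1123Hostname(name string) bool {
	if name == "" || len(name) > 253 {
		return false
	}
	for _, label := range strings.Split(name, ".") {
		if len(label) == 0 || len(label) > 63 {
			return false
		}
		if label[0] == '-' || label[len(label)-1] == '-' {
			return false
		}
		for _, r := range label {
			isAlnum := (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9')
			if !isAlnum && r != '-' {
				return false
			}
		}
	}
	return true
}

func main() {
	fmt.Println(validRFC1123Hostname("my-router.lan")) // true
	fmt.Println(validRFC1123Hostname("bad_host"))      // false: underscore not allowed
}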
131 lines
4.0 KiB
Go
package cli

import (
	"sync"
	"sync/atomic"
	"time"

	"github.com/Control-D-Inc/ctrld"
)

const (
	// maxFailureRequest is the maximum number of failed queries allowed before an upstream is marked as down.
	maxFailureRequest = 50
	// checkUpstreamBackoffSleep is the time interval between upstream checks.
	checkUpstreamBackoffSleep = 2 * time.Second
)

// upstreamMonitor monitors the health of configured upstreams.
type upstreamMonitor struct {
	cfg    *ctrld.Config
	logger atomic.Pointer[ctrld.Logger]

	mu         sync.RWMutex
	checking   map[string]bool   // upstreams with a health check currently in progress
	down       map[string]bool   // upstreams currently considered down
	failureReq map[string]uint64 // failed query count per upstream since the last reset
	recovered  map[string]bool   // upstreams recently reset; failure counting is briefly suppressed

	// failureTimerActive tracks if a timer is already running for a given upstream.
	failureTimerActive map[string]bool
}

// newUpstreamMonitor creates a new upstream monitor instance.
func newUpstreamMonitor(cfg *ctrld.Config, logger *ctrld.Logger) *upstreamMonitor {
	um := &upstreamMonitor{
		cfg:                cfg,
		checking:           make(map[string]bool),
		down:               make(map[string]bool),
		failureReq:         make(map[string]uint64),
		recovered:          make(map[string]bool),
		failureTimerActive: make(map[string]bool),
	}
	um.logger.Store(logger)
	for n := range cfg.Upstream {
		upstream := upstreamPrefix + n
		um.reset(upstream)
	}
	um.reset(upstreamOS)
	return um
}

// increaseFailureCount increments the failed query count for an upstream and logs debug information.
// It uses a timer to debounce failure detection, ensuring that an upstream is marked as down
// within 10 seconds if failures persist, without spawning duplicate goroutines.
func (um *upstreamMonitor) increaseFailureCount(upstream string) {
	um.mu.Lock()
	defer um.mu.Unlock()

	if um.recovered[upstream] {
		um.logger.Load().Debug().Msgf("upstream %q is recovered, skipping failure count increase", upstream)
		return
	}

	um.failureReq[upstream] += 1
	failedCount := um.failureReq[upstream]

	// Log the updated failure count.
	um.logger.Load().Debug().Msgf("upstream %q failure count updated to %d", upstream, failedCount)

	// If this is the first failure and no timer is running, start a 10-second timer.
	if failedCount == 1 && !um.failureTimerActive[upstream] {
		um.failureTimerActive[upstream] = true
		go func(upstream string) {
			time.Sleep(10 * time.Second)
			um.mu.Lock()
			defer um.mu.Unlock()
			// If no success occurred during the 10-second window (i.e. the counter remains > 0)
			// and the upstream is not in a recovered state, mark it as down.
			if um.failureReq[upstream] > 0 && !um.recovered[upstream] {
				um.down[upstream] = true
				um.logger.Load().Warn().Msgf("upstream %q marked as down after 10 seconds (failure count: %d)", upstream, um.failureReq[upstream])
			}
			// Reset the timer flag so that a new timer can be spawned if needed.
			um.failureTimerActive[upstream] = false
		}(upstream)
	}

	// If the failure count quickly reaches the threshold, mark the upstream as down immediately.
	if failedCount >= maxFailureRequest {
		um.down[upstream] = true
		um.logger.Load().Warn().Msgf("upstream %q marked as down immediately (failure count: %d)", upstream, failedCount)
	}
}

// isDown reports whether the given upstream is currently marked as down.
func (um *upstreamMonitor) isDown(upstream string) bool {
	um.mu.Lock()
	defer um.mu.Unlock()

	return um.down[upstream]
}

// reset marks an upstream as up and sets its failed query counter to zero.
func (um *upstreamMonitor) reset(upstream string) {
	um.mu.Lock()
	um.failureReq[upstream] = 0
	um.down[upstream] = false
	um.recovered[upstream] = true
	um.mu.Unlock()
	go func() {
		// Debounce the recovery to avoid counting failures that were already in flight.
		time.Sleep(1 * time.Second)
		um.mu.Lock()
		um.recovered[upstream] = false
		um.mu.Unlock()
	}()
}

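// Taken together, the failure-handling paths above behave as follows: the
// first failed query starts a single 10-second timer; if the counter is still
// non-zero and the upstream is not within its brief recovery window when that
// timer fires, the upstream is marked down. Reaching maxFailureRequest
// failures sooner marks it down immediately, while reset clears the counter,
// marks the upstream up, and suppresses new failure counting for about one
// second.
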
// countHealthy returns the number of upstreams in the provided list that are considered healthy.
func (um *upstreamMonitor) countHealthy(upstreams []string) int {
	var count int
	um.mu.RLock()
	for _, upstream := range upstreams {
		if !um.down[upstream] {
			count++
		}
	}
	um.mu.RUnlock()
	return count
}
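For orientation, here is a minimal caller-side sketch of how this monitor could be consulted around a query path: skip upstreams that isDown reports as down, record failures with increaseFailureCount, and use countHealthy to detect when a whole group is unusable. Only the upstreamMonitor methods defined above are real; the pickAndQuery helper, the query callback, and the error messages are hypothetical, and the sketch assumes it sits alongside the type in package cli (plus an import of the standard errors package).

// pickAndQuery is a hypothetical helper illustrating one way a caller could
// use upstreamMonitor; it is not part of the ctrld source.
func pickAndQuery(um *upstreamMonitor, upstreams []string, query func(string) error) error {
	if um.countHealthy(upstreams) == 0 {
		return errors.New("all upstreams are marked down")
	}
	for _, up := range upstreams {
		if um.isDown(up) {
			continue // skip upstreams the monitor already considers down
		}
		if err := query(up); err != nil {
			um.increaseFailureCount(up) // let the monitor decide when to mark it down
			continue
		}
		return nil // first successful upstream wins
	}
	return errors.New("no healthy upstream answered")
}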