Add prometheus exporter

Updates #6
This commit is contained in:
Cuong Manh Le
2024-01-09 01:42:50 +07:00
committed by Cuong Manh Le
parent 44352f8006
commit 71f26a6d81
12 changed files with 363 additions and 18 deletions
+12 -1
View File
@@ -719,6 +719,10 @@ NOTE: Uninstalling will set DNS to values provided by DHCP.`,
sort.Strings(s)
return s
}
// If metrics is enabled, server set this for all clients, so we can check only the first one.
// Ideally, we may have a field in response to indicate that query count should be shown, but
// it would break earlier version of ctrld, which only look list of clients in response.
withQueryCount := len(clients) > 0 && clients[0].IncludeQueryCount
data := make([][]string, len(clients))
for i, c := range clients {
row := []string{
@@ -727,10 +731,17 @@ NOTE: Uninstalling will set DNS to values provided by DHCP.`,
c.Mac,
strings.Join(map2Slice(c.Source), ","),
}
if withQueryCount {
row = append(row, strconv.FormatInt(c.QueryCount, 10))
}
data[i] = row
}
table := tablewriter.NewWriter(os.Stdout)
table.SetHeader([]string{"IP", "Hostname", "Mac", "Discovered"})
headers := []string{"IP", "Hostname", "Mac", "Discovered"}
if withQueryCount {
headers = append(headers, "Queries")
}
table.SetHeader(headers)
table.SetAutoFormatHeaders(false)
table.AppendBulk(data)
table.Render()
+21
View File
@@ -10,6 +10,8 @@ import (
"sort"
"time"
dto "github.com/prometheus/client_model/go"
"github.com/Control-D-Inc/ctrld"
)
@@ -66,6 +68,25 @@ func (p *prog) registerControlServerHandler() {
sort.Slice(clients, func(i, j int) bool {
return clients[i].IP.Less(clients[j].IP)
})
if p.cfg.Service.MetricsQueryStats {
for _, client := range clients {
client.IncludeQueryCount = true
dm := &dto.Metric{}
m, err := statsClientQueriesCount.MetricVec.GetMetricWithLabelValues(
client.IP.String(),
client.Mac,
client.Hostname,
)
if err != nil {
mainLog.Load().Debug().Err(err).Msgf("could not get metrics for client: %v", client)
continue
}
if err := m.Write(dm); err == nil {
client.QueryCount = int64(dm.Counter.GetValue())
}
}
}
if err := json.NewEncoder(w).Encode(&clients); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
+55 -11
View File
@@ -54,6 +54,14 @@ type proxyRequest struct {
ufr *upstreamForResult
}
// proxyResponse contains data for proxying a DNS response from upstream.
type proxyResponse struct {
answer *dns.Msg
cached bool
clientInfo bool
upstream string
}
// upstreamForResult represents the result of processing rules for a request.
type upstreamForResult struct {
upstreams []string
@@ -101,26 +109,49 @@ func (p *prog) serveDNS(listenerNum string) error {
fmtSrcToDest := fmtRemoteToLocal(listenerNum, ci.Hostname, remoteAddr.String())
t := time.Now()
ctrld.Log(ctx, mainLog.Load().Info(), "QUERY: %s: %s %s", fmtSrcToDest, dns.TypeToString[q.Qtype], domain)
res := p.upstreamFor(ctx, listenerNum, listenerConfig, remoteAddr, ci.Mac, domain)
ur := p.upstreamFor(ctx, listenerNum, listenerConfig, remoteAddr, ci.Mac, domain)
labelValues := make([]string, 0, len(statsQueriesCountLabels))
labelValues = append(labelValues, net.JoinHostPort(listenerConfig.IP, strconv.Itoa(listenerConfig.Port)))
labelValues = append(labelValues, ci.IP)
labelValues = append(labelValues, ci.Mac)
labelValues = append(labelValues, ci.Hostname)
var answer *dns.Msg
if !res.matched && listenerConfig.Restricted {
if !ur.matched && listenerConfig.Restricted {
ctrld.Log(ctx, mainLog.Load().Info(), "query refused, %s does not match any network policy", remoteAddr.String())
answer = new(dns.Msg)
answer.SetRcode(m, dns.RcodeRefused)
labelValues = append(labelValues, "") // no upstream
} else {
var failoverRcode []int
if listenerConfig.Policy != nil {
failoverRcode = listenerConfig.Policy.FailoverRcodeNumbers
}
answer = p.proxy(ctx, &proxyRequest{
pr := p.proxy(ctx, &proxyRequest{
msg: m,
ci: ci,
failoverRcodes: failoverRcode,
ufr: res,
ufr: ur,
})
answer = pr.answer
rtt := time.Since(t)
ctrld.Log(ctx, mainLog.Load().Debug(), "received response of %d bytes in %s", answer.Len(), rtt)
upstream := pr.upstream
switch {
case pr.cached:
upstream = "cache"
case pr.clientInfo:
upstream = "client_info_table"
}
labelValues = append(labelValues, upstream)
}
labelValues = append(labelValues, dns.TypeToString[q.Qtype])
labelValues = append(labelValues, dns.RcodeToString[answer.Rcode])
go func() {
p.WithLabelValuesInc(statsQueriesCount, labelValues...)
p.WithLabelValuesInc(statsClientQueriesCount, []string{ci.IP, ci.Mac, ci.Hostname}...)
}()
if err := w.WriteMsg(answer); err != nil {
ctrld.Log(ctx, mainLog.Load().Error().Err(err), "serveDNS: failed to send DNS response to client")
}
@@ -360,7 +391,7 @@ func (p *prog) proxyLanHostnameQuery(ctx context.Context, msg *dns.Msg) *dns.Msg
return nil
}
func (p *prog) proxy(ctx context.Context, req *proxyRequest) *dns.Msg {
func (p *prog) proxy(ctx context.Context, req *proxyRequest) *proxyResponse {
var staleAnswer *dns.Msg
upstreams := req.ufr.upstreams
serveStaleCache := p.cache != nil && p.cfg.Service.CacheServeStale
@@ -370,6 +401,8 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *dns.Msg {
upstreams = []string{upstreamOS}
}
res := &proxyResponse{}
// LAN/PTR lookup flow:
//
// 1. If there's matching rule, follow it.
@@ -384,14 +417,18 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *dns.Msg {
case isPrivatePtrLookup(req.msg):
isLanOrPtrQuery = true
if answer := p.proxyPrivatePtrLookup(ctx, req.msg); answer != nil {
return answer
res.answer = answer
res.clientInfo = true
return res
}
upstreams, upstreamConfigs = p.upstreamsAndUpstreamConfigForLanAndPtr(upstreams, upstreamConfigs)
ctrld.Log(ctx, mainLog.Load().Debug(), "private PTR lookup, using upstreams: %v", upstreams)
case isLanHostnameQuery(req.msg):
isLanOrPtrQuery = true
if answer := p.proxyLanHostnameQuery(ctx, req.msg); answer != nil {
return answer
res.answer = answer
res.clientInfo = true
return res
}
upstreams, upstreamConfigs = p.upstreamsAndUpstreamConfigForLanAndPtr(upstreams, upstreamConfigs)
ctrld.Log(ctx, mainLog.Load().Debug(), "lan hostname lookup, using upstreams: %v", upstreams)
@@ -413,7 +450,9 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *dns.Msg {
if cachedValue.Expire.After(now) {
ctrld.Log(ctx, mainLog.Load().Debug(), "hit cached response")
setCachedAnswerTTL(answer, now, cachedValue.Expire)
return answer
res.answer = answer
res.cached = true
return res
}
staleAnswer = answer
}
@@ -475,7 +514,9 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *dns.Msg {
ctrld.Log(ctx, mainLog.Load().Debug(), "serving stale cached response")
now := time.Now()
setCachedAnswerTTL(staleAnswer, now, now.Add(staleTTL))
return staleAnswer
res.answer = staleAnswer
res.cached = true
return res
}
continue
}
@@ -509,12 +550,15 @@ func (p *prog) proxy(ctx context.Context, req *proxyRequest) *dns.Msg {
hostname = req.ci.Hostname
}
ctrld.Log(ctx, mainLog.Load().Info(), "REPLY: %s -> %s (%s): %s", upstreams[n], req.ufr.srcAddr, hostname, dns.RcodeToString[answer.Rcode])
return answer
res.answer = answer
res.upstream = upstreamConfig.Endpoint
return res
}
ctrld.Log(ctx, mainLog.Load().Error(), "all %v endpoints failed", upstreams)
answer := new(dns.Msg)
answer.SetRcode(req.msg, dns.RcodeServerFailure)
return answer
res.answer = answer
return res
}
func (p *prog) upstreamsAndUpstreamConfigForLanAndPtr(upstreams []string, upstreamConfigs []*ctrld.UpstreamConfig) ([]string, []*ctrld.UpstreamConfig) {
+2 -2
View File
@@ -187,8 +187,8 @@ func TestCache(t *testing.T) {
got1 := prog.proxy(context.Background(), req1)
got2 := prog.proxy(context.Background(), req2)
assert.NotSame(t, got1, got2)
assert.Equal(t, answer1.Rcode, got1.Rcode)
assert.Equal(t, answer2.Rcode, got2.Rcode)
assert.Equal(t, answer1.Rcode, got1.answer.Rcode)
assert.Equal(t, answer2.Rcode, got2.answer.Rcode)
}
func Test_ipAndMacFromMsg(t *testing.T) {
+150
View File
@@ -0,0 +1,150 @@
package cli
import (
"context"
"encoding/json"
"net"
"net/http"
"runtime"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/prometheus/prom2json"
)
// metricsServer represents a server to expose Prometheus metrics via HTTP.
type metricsServer struct {
server *http.Server
mux *http.ServeMux
reg *prometheus.Registry
addr string
started bool
}
// newMetricsServer returns new metrics server.
func newMetricsServer(addr string, reg *prometheus.Registry) (*metricsServer, error) {
mux := http.NewServeMux()
ms := &metricsServer{
server: &http.Server{Handler: mux},
mux: mux,
reg: reg,
}
ms.addr = addr
ms.registerMetricsServerHandler()
return ms, nil
}
// register adds handlers for given pattern.
func (ms *metricsServer) register(pattern string, handler http.Handler) {
ms.mux.Handle(pattern, handler)
}
// registerMetricsServerHandler adds handlers for metrics server.
func (ms *metricsServer) registerMetricsServerHandler() {
ms.register("/metrics", promhttp.HandlerFor(
ms.reg,
promhttp.HandlerOpts{
EnableOpenMetrics: true,
Timeout: 10 * time.Second,
},
))
ms.register("/metrics/json", jsonResponse(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
g := prometheus.ToTransactionalGatherer(ms.reg)
mfs, done, err := g.Gather()
defer done()
if err != nil {
msg := "could not gather metrics"
mainLog.Load().Warn().Err(err).Msg(msg)
http.Error(w, msg, http.StatusInternalServerError)
return
}
result := make([]*prom2json.Family, 0, len(mfs))
for _, mf := range mfs {
result = append(result, prom2json.NewFamily(mf))
}
if err := json.NewEncoder(w).Encode(result); err != nil {
msg := "could not marshal metrics result"
mainLog.Load().Warn().Err(err).Msg(msg)
http.Error(w, msg, http.StatusInternalServerError)
return
}
})))
}
// start runs the metricsServer.
func (ms *metricsServer) start() error {
listener, err := net.Listen("tcp", ms.addr)
if err != nil {
return err
}
go ms.server.Serve(listener)
ms.started = true
return nil
}
// stop shutdowns the metricsServer within 2 seconds timeout.
func (ms *metricsServer) stop() error {
if !ms.started {
return nil
}
ctx, cancel := context.WithTimeout(context.Background(), time.Second*1)
defer cancel()
return ms.server.Shutdown(ctx)
}
// runMetricsServer initializes metrics stats and runs the metrics server if enabled.
func (p *prog) runMetricsServer(ctx context.Context, reloadCh chan struct{}) {
if !p.metricsEnabled() {
return
}
// Reset all stats.
statsVersion.Reset()
statsQueriesCount.Reset()
statsClientQueriesCount.Reset()
reg := prometheus.NewRegistry()
// Register queries count stats if enabled.
if cfg.Service.MetricsQueryStats {
reg.MustRegister(statsQueriesCount)
reg.MustRegister(statsClientQueriesCount)
}
addr := p.cfg.Service.MetricsListener
ms, err := newMetricsServer(addr, reg)
if err != nil {
mainLog.Load().Warn().Err(err).Msg("could not create new metrics server")
return
}
// Only start listener address if defined.
if addr != "" {
// Go runtime stats.
reg.MustRegister(collectors.NewBuildInfoCollector())
reg.MustRegister(collectors.NewGoCollector(
collectors.WithGoCollectorRuntimeMetrics(collectors.MetricsAll),
))
// ctrld stats.
reg.MustRegister(statsVersion)
statsVersion.WithLabelValues(commit, runtime.Version(), curVersion()).Inc()
reg.MustRegister(statsTimeStart)
statsTimeStart.Set(float64(time.Now().Unix()))
mainLog.Load().Debug().Msgf("starting metrics server on: %s", addr)
if err := ms.start(); err != nil {
mainLog.Load().Warn().Err(err).Msg("could not start metrics server")
return
}
}
select {
case <-p.stopCh:
case <-ctx.Done():
case <-reloadCh:
}
if err := ms.stop(); err != nil {
mainLog.Load().Warn().Err(err).Msg("could not stop metrics server")
return
}
}
+12
View File
@@ -348,6 +348,13 @@ func (p *prog) run(reload bool, reloadCh chan struct{}) {
p.checkDnsLoopTicker(ctx)
}()
wg.Add(1)
// Prometheus exporter goroutine.
go func() {
defer wg.Done()
p.runMetricsServer(ctx, reloadCh)
}()
if !reload {
// Stop writing log to unix socket.
consoleWriter.Out = os.Stdout
@@ -365,6 +372,11 @@ func (p *prog) run(reload bool, reloadCh chan struct{}) {
wg.Wait()
}
// metricsEnabled reports whether prometheus exporter is enabled/disabled.
func (p *prog) metricsEnabled() bool {
return p.cfg.Service.MetricsQueryStats || p.cfg.Service.MetricsListener != ""
}
func (p *prog) Stop(s service.Service) error {
mainLog.Load().Info().Msg("Service stopped")
close(p.stopCh)
+57
View File
@@ -0,0 +1,57 @@
package cli
import "github.com/prometheus/client_golang/prometheus"
const (
metricsLabelListener = "listener"
metricsLabelClientSourceIP = "client_source_ip"
metricsLabelClientMac = "client_mac"
metricsLabelClientHostname = "client_hostname"
metricsLabelUpstream = "upstream"
metricsLabelRRType = "rr_type"
metricsLabelRCode = "rcode"
)
// statsVersion represent ctrld version.
var statsVersion = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "ctrld_build_info",
Help: "Version of ctrld process.",
}, []string{"gitref", "goversion", "version"})
// statsTimeStart represents start time of ctrld service.
var statsTimeStart = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "ctrld_time_seconds",
Help: "Start time of the ctrld process since unix epoch in seconds.",
})
var statsQueriesCountLabels = []string{
metricsLabelListener,
metricsLabelClientSourceIP,
metricsLabelClientMac,
metricsLabelClientHostname,
metricsLabelUpstream,
metricsLabelRRType,
metricsLabelRCode,
}
// statsQueriesCount counts total number of queries.
var statsQueriesCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "ctrld_queries_count",
Help: "Total number of queries.",
}, statsQueriesCountLabels)
// statsClientQueriesCount counts total number of queries of a client.
//
// The labels "client_source_ip", "client_mac", "client_hostname" are unbounded,
// thus this stat is highly inefficient if there are many devices.
var statsClientQueriesCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "ctrld_client_queries_count",
Help: "Total number queries of a client.",
}, []string{metricsLabelClientSourceIP, metricsLabelClientMac, metricsLabelClientHostname})
// WithLabelValuesInc increases prometheus counter by 1 if query stats is enabled.
func (p *prog) WithLabelValuesInc(c *prometheus.CounterVec, lvs ...string) {
if p.cfg.Service.MetricsQueryStats {
c.WithLabelValues(lvs...).Inc()
}
}