dn42promsrv/dns.go
Simon Marsh 599a2ab920
All checks were successful
continuous-integration/drone/push Build is passing
add lantian, jlu5 nodes, remove jrb0001 nodes
2021-10-02 10:42:03 +01:00

652 lines
18 KiB
Go

//////////////////////////////////////////////////////////////////////////
// DNS Metrics
//////////////////////////////////////////////////////////////////////////
package main
//////////////////////////////////////////////////////////////////////////
import (
"encoding/json"
"errors"
// "fmt"
dns "github.com/miekg/dns"
"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
"io/ioutil"
"math"
"net/http"
"strconv"
"strings"
"sync"
"time"
)
//////////////////////////////////////////////////////////////////////////
// data structures
// DNSMetrics holds the prometheus gauge vectors for the DNS server metrics,
// together with the label sets used to address each server's series.
type DNSMetrics struct {
	// gauge vectors created and registered by Register
	soa, rtt, valid, stime *prometheus.GaugeVec

	// label_map[i] is the label set built from dns_servers[i]
	label_map []prometheus.Labels
}
// DNSServer describes a single DNS server to be queried.
//
// NOTE: dns_servers initialises this struct with positional literals, so
// the order of the fields must not be changed.
type DNSServer struct {
	role, owner, name string // exported as the "role", "owner" and "name" labels
	ip                uint8  // exported as the "ip" label (4 or 6 in dns_servers)
	addr              string // address handed to the DNS client when querying
	soa               uint32 // serial from the most recent successful query
}
// DNSResult holds the data extracted from a single successful SOA query
type DNSResult struct {
// serial number taken from the SOA record in the answer
serial uint32
// round trip time of the query in milliseconds, rounded to a whole number
rtt float64
// NSID from the EDNS0 options of the response; empty if none was returned
nsid string
}
// data structures for querying the current commit metric

// DNSCommitMetrics holds the dn42_dns_commit gauge and the state that
// Collect keeps between runs.
type DNSCommitMetrics struct {
	match      *prometheus.GaugeVec // the dn42_dns_commit gauge vector
	lastUpdate time.Time            // time of the last completed comparison
	mismatch   bool                 // true while the commits are not known to match
	since      time.Time            // time the current mismatch was first seen

	// label sets selecting the "invalid" and "update" series of match
	labelInvalid, labelUpdate prometheus.Labels
}
// JSON payloads decoded when comparing the explorer and master commits.
type (
	// DNSExplorerCommit is decoded from the registry explorer response.
	DNSExplorerCommit struct {
		Commit string
	}

	// DNSMasterCommit is one element of the array decoded from the
	// master response.
	DNSMasterCommit struct {
		Type   string `json:"type"`
		URI    string `json:"uri"`
		Branch string `json:"branch"`
		Commit string `json:"commit"`
	}
)
//////////////////////////////////////////////////////////////////////////
// dns_servers is the hardcoded :( list of DNS servers to query.
//
// Each entry is a positional DNSServer literal:
//
//	role, owner, name, ip version, addr, soa
//
// addr is passed directly to the DNS client and must therefore always be
// in host:port form (IPv6 addresses in square brackets).
var dns_servers = []*DNSServer{
	// master
	{"master", "burble", "b.master.delegation-servers.dn42", 6,
		"[fd42:180:3de0:30::1]:53", 0},
	{"master", "jrb0001", "j.master.delegation-servers.dn42", 6,
		"[fd42:180:3de0:10:5054:ff:fe87:ea39]:53", 0},

	// anycast
	{"recursive", "anycast", "a0.recursive-servers.dn42", 4,
		"172.20.0.53:53", 0},
	{"recursive", "anycast", "a0.recursive-servers.dn42", 6,
		"[fd42:d42:d42:54::1]:53", 0},
	{"recursive", "anycast", "a3.recursive-servers.dn42", 4,
		"172.23.0.53:53", 0},
	{"recursive", "anycast", "a3.recursive-servers.dn42", 6,
		"[fd42:d42:d42:53::1]:53", 0},

	// burble
	{"delegation", "burble", "b.delegation-servers.dn42", 4,
		"172.20.129.1:53", 0},
	{"delegation", "burble", "b.delegation-servers.dn42", 6,
		"[fd42:4242:2601:ac53::1]:53", 0},
	{"recursive", "burble", "b.recursive-servers.dn42", 4,
		"172.20.129.2:53", 0},
	{"recursive", "burble", "b.recursive-servers.dn42", 6,
		"[fd42:4242:2601:ac53::53]:53", 0},
	{"burble.dn42", "burble", "de-fra1", 6,
		"[fd42:4242:2601:3102:a25e:b7ff:feea:64ed]:53", 0},
	{"burble.dn42", "burble", "us-dal3", 6,
		"[fd42:4242:2601:2a02:a25e:b7ff:feea:64ed]:53", 0},
	{"burble.dn42", "burble", "sg-sin2", 6,
		"[fd42:4242:2601:3702:a25e:b7ff:feea:64ed]:53", 0},
	{"burble.dn42", "burble", "ca-bhs2", 6,
		"[fd42:4242:2601:2d02:a25e:b7ff:feea:64ed]:53", 0},
	{"burble.dn42", "burble", "us-lax1", 6,
		"[fd42:4242:2601:3a02:a25e:b7ff:feea:64ed]:53", 0},

	// jrb0001
	{"delegation", "jrb0001", "j.delegation-servers.dn42", 4,
		"172.20.1.254:53", 0},
	{"delegation", "jrb0001", "j.delegation-servers.dn42", 6,
		"[fd42:5d71:219:0:216:3eff:fe1e:22d6]:53", 0},
	{"recursive", "jrb0001", "j.recursive-servers.dn42", 4,
		"172.20.1.255:53", 0},
	{"recursive", "jrb0001", "j.recursive-servers.dn42", 6,
		"[fd42:5d71:219:0:216:3eff:fee8:c215]:53", 0},
	{"jrb0001", "jrb0001", "nl-1", 6, "[fd42:5d71:219:1:216:3eff:fe1e:22d6]:53", 0},
	{"jrb0001", "jrb0001", "de-1", 6, "[fd42:5d71:219:2:216:3eff:fe1e:22d6]:53", 0},
	{"jrb0001", "jrb0001", "gb-1", 6, "[fd42:5d71:219:3:216:3eff:fe1e:22d6]:53", 0},
	{"jrb0001", "jrb0001", "fr-1", 6, "[fd42:5d71:219:4:216:3eff:fe1e:22d6]:53", 0},
	{"jrb0001", "jrb0001", "au-1", 6, "[fd42:5d71:219:6:216:3eff:fe1e:22d6]:53", 0},
	{"jrb0001", "jrb0001", "jp-1", 6, "[fd42:5d71:219:7:216:3eff:fe1e:22d6]:53", 0},
	{"jrb0001", "jrb0001", "sg-1", 6, "[fd42:5d71:219:8:216:3eff:fe1e:22d6]:53", 0},
	{"jrb0001", "jrb0001", "ca-1", 6, "[fd42:5d71:219:9:216:3eff:fe1e:22d6]:53", 0},
	{"jrb0001", "jrb0001", "us-2", 6, "[fd42:5d71:219:a:216:3eff:fe1e:22d6]:53", 0},
	{"jrb0001", "jrb0001", "us-3", 6, "[fd42:5d71:219:b:216:3eff:fe1e:22d6]:53", 0},
	{"jrb0001", "jrb0001", "us-4", 6, "[fd42:5d71:219:c:216:3eff:fe1e:22d6]:53", 0},
	{"jrb0001", "jrb0001", "us-5", 6, "[fd42:5d71:219:d:216:3eff:fe1e:22d6]:53", 0},

	// Kioubit
	{"delegation", "Kioubit", "k.delegation-servers.dn42", 4,
		"172.20.14.34:53", 0},
	{"delegation", "Kioubit", "k.delegation-servers.dn42", 6,
		"[fdcf:8538:9ad5:1111::2]:53", 0},
	{"recursive", "Kioubit", "k.recursive-servers.dn42", 4,
		"172.20.14.33:53", 0},
	{"recursive", "Kioubit", "k.recursive-servers.dn42", 6,
		"[fdcf:8538:9ad5:1111::1]:53", 0},
	{"Kioubit", "Kioubit", "de2", 6, "[fdcf:8538:9ad5:1112::1]:53", 0},
	{"Kioubit", "Kioubit", "us2", 6, "[fdcf:8538:9ad5:1112::5]:53", 0},
	{"Kioubit", "Kioubit", "fr1", 6, "[fdcf:8538:9ad5:1112::8]:53", 0},
	{"Kioubit", "Kioubit", "us3", 6, "[fdcf:8538:9ad5:1112::9]:53", 0},
	{"Kioubit", "Kioubit", "uk1", 6, "[fdcf:8538:9ad5:1112::10]:53", 0},
	{"Kioubit", "Kioubit", "hk1", 6, "[fdcf:8538:9ad5:1112::11]:53", 0},

	// jlu5
	// (the IPv4 address previously had no port, so it could never be dialled)
	{"recursive", "jlu5", "l.recursive-servers.dn42", 4,
		"172.22.108.22:53", 0},
	{"recursive", "jlu5", "l.recursive-servers.dn42", 6,
		"[fd86:bad:11b7:53::2]:53", 0},
	{"jlu5", "jlu5", "us-chi01", 6, "[fd86:bad:11b7::1]:53", 0},
	{"jlu5", "jlu5", "us-sea02", 6, "[fd86:bad:11b7:1::1]:53", 0},
	{"jlu5", "jlu5", "us-atl01", 6, "[fd86:bad:11b7:9::1]:53", 0},
	{"jlu5", "jlu5", "us-lax01", 6, "[fd86:bad:11b7:22::1]:53", 0},
	{"jlu5", "jlu5", "us-nyc02", 6, "[fd86:bad:11b7:23::1]:53", 0},
	{"jlu5", "jlu5", "uk-lon01", 6, "[fd86:bad:11b7:116::1]:53", 0},
	{"jlu5", "jlu5", "de-nbg01", 6, "[fd86:bad:11b7:117::1]:53", 0},
	{"jlu5", "jlu5", "nl-ams02", 6, "[fd86:bad:11b7:118::1]:53", 0},
	{"jlu5", "jlu5", "jp-tyo01", 6, "[fd86:bad:11b7:224::1]:53", 0},
	{"jlu5", "jlu5", "au-syd01", 6, "[fd86:bad:11b7:225::1]:53", 0},
	{"jlu5", "jlu5", "es-mad01", 6, "[fd86:bad:11b7:120::1]:53", 0},
	{"jlu5", "jlu5", "sg01", 6, "[fd86:bad:11b7:227::1]:53", 0},
	{"jlu5", "jlu5", "br-sao01", 6, "[fd86:bad:11b7:333::1]:53", 0},

	// lantian
	// (the IPv4 address previously had no port, so it could never be dialled)
	{"recursive", "lantian", "t.recursive-servers.dn42", 4,
		"172.22.76.110:53", 0},
	{"recursive", "lantian", "t.recursive-servers.dn42", 6,
		"[fdbc:f9dc:67ad:2547::53]:53", 0},
	{"lantian", "lantian", "gigsgigscloud", 6, "[fdbc:f9dc:67ad:1::53]:53", 0},
	{"lantian", "lantian", "virtono", 6, "[fdbc:f9dc:67ad:2::53]:53", 0},
	{"lantian", "lantian", "hostdare", 6, "[fdbc:f9dc:67ad:3::53]:53", 0},
	{"lantian", "lantian", "virmach-ny1g", 6, "[fdbc:f9dc:67ad:8::53]:53", 0},
}
//////////////////////////////////////////////////////////////////////////
//
// DNS Server Metrics
//
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
// Register creates the four DNS gauge vectors, registers each of them via
// prometheus.MustRegister and pre-populates the label set for every entry
// in dns_servers.
func (m *DNSMetrics) Register() {
	// each gauge is created and registered in turn, all with the same
	// set of label names
	gauges := []struct {
		vec        **prometheus.GaugeVec
		name, help string
	}{
		{&m.soa, "dn42_dns_soa", "SOA for .dn42 domain"},
		{&m.rtt, "dn42_dns_rtt", "RTT when collecting SOA for .dn42 domain"},
		{&m.valid, "dn42_dns_valid",
			"0 = response and latest serial, 1 = response and matching serial, " +
				"2 = response but incorrect serial, 3 = server error"},
		{&m.stime, "dn42_dns_stime",
			"Returns the time between now and the SOA serial number"},
	}
	for _, g := range gauges {
		*g.vec = prometheus.NewGaugeVec(
			prometheus.GaugeOpts{Name: g.name, Help: g.help},
			[]string{"role", "owner", "name", "ip", "addr"},
		)
		prometheus.MustRegister(*g.vec)
	}

	// build the label sets once, in the same order as dns_servers, so that
	// Collect can look them up by index
	m.label_map = make([]prometheus.Labels, 0, len(dns_servers))
	for _, srv := range dns_servers {
		m.label_map = append(m.label_map, prometheus.Labels{
			"role":  srv.role,
			"owner": srv.owner,
			"name":  srv.name,
			"ip":    strconv.Itoa(int(srv.ip)),
			"addr":  srv.addr,
		})
	}
}
//////////////////////////////////////////////////////////////////////////
// Collect queries every server in dns_servers for the dn42 SOA and updates
// the soa, rtt, stime and valid gauges from the results.
//
// The valid gauge is set to:
//
//	0 - serial number matches the latest master
//	1 - serial number matches one master (but not the latest)
//	2 - serial number doesn't match any master
//	3 - an error occurred: no response, a serial in the future, or a
//	    serial more than 25 hours old
func (m *DNSMetrics) Collect() {
	// hold the collected results in a slice where each entry corresponds
	// to the same index in dns_servers
	results := make([]*DNSResult, len(dns_servers))

	// query each server up to 3 times to try and get a result
	for attempt := 0; attempt < 3; attempt++ {
		// run each tranche of queries in parallel; every goroutine writes
		// only to its own slot in results
		var wg sync.WaitGroup
		for ix := range results {
			if results[ix] != nil {
				// already have a result from an earlier attempt
				continue
			}
			wg.Add(1)
			go func(i int) {
				defer wg.Done()
				results[i] = dns_servers[i].Query()
			}(ix)
		}
		// all done
		wg.Wait()
	}

	now := uint64(time.Now().Unix())

	// create a list of the master servers' SOA serials and record which
	// one is the latest; the slice starts empty so that it only ever
	// contains serials actually returned by a master
	var latest_soa uint32
	masters := make([]uint32, 0, 2)
	for ix, server := range dns_servers {
		// only masters that actually returned a result are considered
		if server.role != "master" || results[ix] == nil {
			continue
		}
		soa := results[ix].serial
		masters = append(masters, soa)
		if soa > latest_soa {
			latest_soa = soa
		}
	}

	// now go through each result and update the metrics
	for ix, r := range results {
		// assume that an error occurred unless told otherwise
		var valid uint = 3

		// check if there was a valid result
		if r != nil {
			// update the server SOA from the result
			dns_servers[ix].soa = r.serial

			// SOA and RTT are direct metrics returned from the query
			m.soa.With(m.label_map[ix]).Set(float64(r.serial))
			m.rtt.With(m.label_map[ix]).Set(r.rtt)

			// if the server has a high rtt ( > 500ms), then log an info message
			if r.rtt > 500 {
				log.WithFields(log.Fields{
					"result": r,
					"server": dns_servers[ix].name,
					"ipv":    dns_servers[ix].ip,
				}).Info("DNS Server high RTT")
			}

			// the serial is treated as a unix timestamp. It's possible for
			// it to be in the future if there is a clock mismatch between
			// monitor and DNS server; that is flagged as a server error
			// (valid stays at 3) and stime is left untouched. Checking this
			// first also keeps the age subtraction from wrapping around.
			if serial := uint64(r.serial); serial <= now {
				// stime is the difference between now and the SOA, to
				// allow checking that it is not stale
				age := now - serial
				m.stime.With(m.label_map[ix]).Set(float64(age))

				// it's an error if the SOA is more than 25 hours old
				if age < 3600*25 {
					if r.serial == latest_soa {
						// the SOA matches the current latest SOA
						valid = 0
					} else {
						// assume no match, then step through each master
						// to see if the SOA matches somewhere
						valid = 2
						for _, soa := range masters {
							if r.serial == soa {
								valid = 1
								break
							}
						}
					}
				}
			}
		}

		// finally set the valid status
		m.valid.With(m.label_map[ix]).Set(float64(valid))
	}
}
//////////////////////////////////////////////////////////////////////////
// Query sends a single SOA query for the dn42. zone to the server and
// returns the serial, the round trip time (in whole milliseconds) and the
// NSID if the response carried one. Any failure is logged as a warning and
// nil is returned.
func (s *DNSServer) Query() *DNSResult {
	// build the query; recursion is only requested from recursive servers
	query := new(dns.Msg)
	query.Id = dns.Id()
	query.RecursionDesired = s.role == "recursive"

	// query the dn42 root zone SOA
	query.Question = []dns.Question{
		{"dn42.", dns.TypeSOA, dns.ClassINET},
	}

	// add EDNS0 options to also query the service ID (NSID)
	// pretty much copied verbatim from the library docs
	// (currently disabled)
	//	opts := new(dns.OPT)
	//	opts.Hdr.Name = "."
	//	opts.Hdr.Rrtype = dns.TypeOPT
	// create the NSID option
	//	ns_opt := new(dns.EDNS0_NSID)
	//	ns_opt.Code = dns.EDNS0NSID
	//	ns_opt.Nsid = ""
	// add the NSID option to the opts RR
	//	opts.Option = append(opts.Option, ns_opt)
	// then add the opts RR to the query
	//	msg.SetEdns0(4096, false)
	//	msg.Extra = []dns.RR{opts}

	// create a new DNS client with a 4 second timeout and query the server
	client := &dns.Client{Timeout: 4 * time.Second}
	resp, rtt, err := client.Exchange(query, s.addr)

	// exactly one answer record is expected
	if err != nil || len(resp.Answer) != 1 {
		log.WithFields(log.Fields{
			"error":  err,
			"resp":   resp,
			"server": s,
		}).Warn("Failed to query DNS server")
		return nil
	}

	// was an SOA returned ?
	soa, isSOA := resp.Answer[0].(*dns.SOA)
	if !isSOA {
		log.WithFields(log.Fields{
			"resp":   resp,
			"server": s,
		}).Warn("DNS response was not an SOA")
		return nil
	}

	// got an SOA result, report the rtt in milliseconds
	result := &DNSResult{
		serial: soa.Serial,
		rtt:    math.Round(rtt.Seconds() * 1000),
	}

	// did we also get an NSID result ? if the response contains an EDNS0
	// record, pick the NSID out of its options
	if edns := resp.IsEdns0(); edns != nil {
		for _, opt := range edns.Option {
			if nsid, ok := opt.(*dns.EDNS0_NSID); ok {
				result.nsid = nsid.Nsid
			}
		}
	}

	return result
}
//////////////////////////////////////////////////////////////////////////
//
// DNS Commit Metric
//
// Check the master commit against the current registry commit
// (via the explorer) to check that the master is being updated
//
//////////////////////////////////////////////////////////////////////////
// Register creates the dn42_dns_commit gauge vector, registers it via
// prometheus.MustRegister and prepares the label sets for its "invalid"
// and "update" series.
func (m *DNSCommitMetrics) Register() {
	opts := prometheus.GaugeOpts{
		Name: "dn42_dns_commit",
		Help: "Time since commits last matched (or -1 for error)",
	}
	m.match = prometheus.NewGaugeVec(opts, []string{"metric"})
	prometheus.MustRegister(m.match)

	// the gauge has a single "metric" label selecting the series
	m.labelInvalid = prometheus.Labels{"metric": "invalid"}
	m.labelUpdate = prometheus.Labels{"metric": "update"}
}
// Collect compares the registry commit reported by the explorer with the
// one reported by the master and updates the dn42_dns_commit gauge.
//
// The comparison is only made when at least an hour has passed since the
// last completed comparison, or when the previous run failed or found a
// mismatch. Otherwise only the "update" series is refreshed.
//
// Series values:
//
//	update:  0 after a completed comparison, -1 if the explorer could not
//	         be queried, -2 if the master could not be queried, otherwise
//	         the seconds since the last completed comparison
//	invalid: 0 when the commits match, -1 when either fetch failed,
//	         otherwise the seconds since the commits were first not known
//	         to match
func (m *DNSCommitMetrics) Collect() {
	now := time.Now()
	interval := now.Sub(m.lastUpdate)

	// only check if it's more than 60 mins since the last successful match
	if interval.Hours() < 1.0 && !m.mismatch {
		// nothing to check yet, just update the time since the last check
		m.match.With(m.labelUpdate).Set(interval.Seconds())
		return
	}

	// flagUnmatched records that the commits are not known to match. The
	// start time is only recorded on the transition into this state, so
	// that a fetch failure followed by a real mismatch measures from a
	// valid time rather than from a zero or stale m.since.
	flagUnmatched := func() {
		if !m.mismatch {
			m.since = now
			m.mismatch = true
		}
	}

	// fetch the current commit from the explorer
	ec, err := m.FetchExplorerCommit()
	if err != nil {
		// couldn't fetch the explorer commit value, set the metric to bad
		m.match.With(m.labelUpdate).Set(-1.0)
		m.match.With(m.labelInvalid).Set(-1.0)
		flagUnmatched()
		return
	}

	// fetch the current commit from the master
	mc, err := m.FetchMasterCommit()
	if err != nil {
		// couldn't fetch master commit value, set the metric to bad
		m.match.With(m.labelUpdate).Set(-2.0)
		m.match.With(m.labelInvalid).Set(-1.0)
		flagUnmatched()
		return
	}

	// both values were fetched, so the comparison counts as completed
	m.lastUpdate = now
	m.match.With(m.labelUpdate).Set(0.0)

	if ec == mc {
		// if the commits match, null the metric
		m.match.With(m.labelInvalid).Set(0.0)
		m.mismatch = false
		return
	}

	// the commits differ; note the start time if this is the first mismatch
	flagUnmatched()
	since := now.Sub(m.since).Seconds()

	log.WithFields(log.Fields{
		"master":   mc,
		"explorer": ec,
		"since":    since,
	}).Warn("DNS Commit Mismatch")

	// set the metric to be the interval since last good match
	m.match.With(m.labelInvalid).Set(since)
}
//////////////////////////////////////////////////////////////////////////
// FetchExplorerCommit fetches the current registry commit from the
// explorer and returns it in lower case.
//
// Any failure to query, read or parse the response is logged as a warning
// and returned as the error, together with an empty string.
func (m *DNSCommitMetrics) FetchExplorerCommit() (string, error) {
	response, err := http.Get("http://dn42regsrv.burble.dn42:8042/api/registry/.meta")
	if err != nil {
		log.WithFields(log.Fields{
			"error": err,
		}).Warn("Unable to query registry explorer")
		return "", err
	}
	// the body must always be closed, otherwise the connection is leaked
	defer response.Body.Close()

	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.WithFields(log.Fields{
			"error": err,
		}).Warn("Unable read explorer response")
		return "", err
	}

	var ec DNSExplorerCommit
	if err := json.Unmarshal(data, &ec); err != nil {
		log.WithFields(log.Fields{
			"error": err,
		}).Warn("Unable to parse explorer JSON")
		return "", err
	}

	return strings.ToLower(ec.Commit), nil
}
//////////////////////////////////////////////////////////////////////////
// FetchMasterCommit fetches the git state from the master and returns, in
// lower case, the commit of the first entry whose URI ends in
// "registry.git".
//
// Any failure to query, read or parse the response is logged as a warning
// and returned as the error. An error is also returned when the response
// contains no registry entry.
func (m *DNSCommitMetrics) FetchMasterCommit() (string, error) {
	// JRB0001's master
	// response, err :=
	//	http.Get("http://[fd42:180:3de0:10:5054:ff:fe87:ea39]:8080/api/git-db-state")

	// burble.dn42 master
	response, err :=
		http.Get("http://[fd42:180:3de0:30::1]:8080/api/git-db-state")
	if err != nil {
		log.WithFields(log.Fields{
			"error": err,
		}).Warn("Unable to query registry master")
		return "", err
	}
	// the body must always be closed, otherwise the connection is leaked
	defer response.Body.Close()

	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.WithFields(log.Fields{
			"error": err,
		}).Warn("Unable read master response")
		return "", err
	}

	var mc []DNSMasterCommit
	if err := json.Unmarshal(data, &mc); err != nil {
		log.WithFields(log.Fields{
			"error": err,
		}).Warn("Unable to parse master JSON")
		return "", err
	}

	// the master reports several repositories, pick out the registry
	for _, c := range mc {
		if strings.HasSuffix(c.URI, "registry.git") {
			return strings.ToLower(c.Commit), nil
		}
	}

	log.WithFields(log.Fields{
		"MasterCommit": mc,
	}).Warn("Unable to find registry commit from master")
	return "", errors.New("Unable to find registry commit from master")
}
//////////////////////////////////////////////////////////////////////////
// end of code