////////////////////////////////////////////////////////////////////////// // DNS Metrics ////////////////////////////////////////////////////////////////////////// package main ////////////////////////////////////////////////////////////////////////// import ( "encoding/json" "errors" // "fmt" dns "github.com/miekg/dns" "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" "io/ioutil" "math" "net/http" "strconv" "strings" "sync" "time" ) ////////////////////////////////////////////////////////////////////////// // data structures // for holding the DNS metrics type DNSMetrics struct { soa *prometheus.GaugeVec rtt *prometheus.GaugeVec valid *prometheus.GaugeVec stime *prometheus.GaugeVec label_map []prometheus.Labels } // structure for specifying DNS servers type DNSServer struct { role string owner string name string ip uint8 addr string soa uint32 } // structure for returning relevant DNS data type DNSResult struct { serial uint32 rtt float64 nsid string } // data structures for querying the current commit metric type DNSCommitMetrics struct { match *prometheus.GaugeVec lastUpdate time.Time mismatch bool since time.Time labelInvalid prometheus.Labels labelUpdate prometheus.Labels } type DNSExplorerCommit struct { Commit string } type DNSMasterCommit struct { Type string `json:"type"` URI string `json:"uri"` Branch string `json:"branch"` Commit string `json:"commit"` } ////////////////////////////////////////////////////////////////////////// // hardcoded :( list of DNS servers to query var dns_servers = []*DNSServer{ // master &DNSServer{"master", "burble", "b.master.delegation-servers.dn42", 6, "[fd42:180:3de0:30::1]:53", 0}, &DNSServer{"master", "jrb0001", "j.master.delegation-servers.dn42", 6, "[fd42:180:3de0:10:5054:ff:fe87:ea39]:53", 0}, // anycast &DNSServer{"recursive", "anycast", "a0.recursive-servers.dn42", 4, "172.20.0.53:53", 0}, &DNSServer{"recursive", "anycast", "a0.recursive-servers.dn42", 6, "[fd42:d42:d42:54::1]:53", 0}, &DNSServer{"recursive", "anycast", "a3.recursive-servers.dn42", 4, "172.23.0.53:53", 0}, &DNSServer{"recursive", "anycast", "a3.recursive-servers.dn42", 6, "[fd42:d42:d42:53::1]:53", 0}, // burble &DNSServer{"delegation", "burble", "b.delegation-servers.dn42", 4, "172.20.129.1:53", 0}, &DNSServer{"delegation", "burble", "b.delegation-servers.dn42", 6, "[fd42:4242:2601:ac53::1]:53", 0}, &DNSServer{"recursive", "burble", "b.recursive-servers.dn42", 4, "172.20.129.2:53", 0}, &DNSServer{"recursive", "burble", "b.recursive-servers.dn42", 6, "[fd42:4242:2601:ac53::53]:53", 0}, &DNSServer{"burble.dn42", "burble", "de-fra1", 6, "[fd42:4242:2601:3102:a25e:b7ff:feea:64ed]:53", 0}, &DNSServer{"burble.dn42", "burble", "ca-bhs2", 6, "[fd42:4242:2601:2d02:a25e:b7ff:feea:64ed]:53", 0}, // jrb0001 &DNSServer{"delegation", "jrb0001", "j.delegation-servers.dn42", 4, "172.20.1.254:53", 0}, &DNSServer{"delegation", "jrb0001", "j.delegation-servers.dn42", 6, "[fd42:5d71:219:0:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"recursive", "jrb0001", "j.recursive-servers.dn42", 4, "172.20.1.255:53", 0}, &DNSServer{"recursive", "jrb0001", "j.recursive-servers.dn42", 6, "[fd42:5d71:219:0:216:3eff:fee8:c215]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "nl-1", 6, "[fd42:5d71:219:1:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "de-1", 6, "[fd42:5d71:219:2:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "gb-1", 6, "[fd42:5d71:219:3:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "fr-1", 6, "[fd42:5d71:219:4:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "au-1", 6, "[fd42:5d71:219:6:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "jp-1", 6, "[fd42:5d71:219:7:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "sg-1", 6, "[fd42:5d71:219:8:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "ca-1", 6, "[fd42:5d71:219:9:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "us-2", 6, "[fd42:5d71:219:a:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "us-3", 6, "[fd42:5d71:219:b:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "us-4", 6, "[fd42:5d71:219:c:216:3eff:fe1e:22d6]:53", 0}, &DNSServer{"jrb0001", "jrb0001", "us-5", 6, "[fd42:5d71:219:d:216:3eff:fe1e:22d6]:53", 0}, // Kioubit &DNSServer{"delegation", "Kioubit", "k.delegation-servers.dn42", 4, "172.20.14.34:53", 0}, &DNSServer{"delegation", "Kioubit", "k.delegation-servers.dn42", 6, "[fdcf:8538:9ad5:1111::2]:53", 0}, &DNSServer{"recursive", "Kioubit", "k.recursive-servers.dn42", 4, "172.20.14.33:53", 0}, &DNSServer{"recursive", "Kioubit", "k.recursive-servers.dn42", 6, "[fdcf:8538:9ad5:1111::1]:53", 0}, &DNSServer{"Kioubit", "Kioubit", "de2", 6, "[fdcf:8538:9ad5:1112::1]:53", 0}, &DNSServer{"Kioubit", "Kioubit", "us2", 6, "[fdcf:8538:9ad5:1111::5]:53", 0}, &DNSServer{"Kioubit", "Kioubit", "fr1", 6, "[fdcf:8538:9ad5:1112::8]:53", 0}, &DNSServer{"Kioubit", "Kioubit", "us3", 6, "[fdcf:8538:9ad5:1112::9]:53", 0}, &DNSServer{"Kioubit", "Kioubit", "uk1", 6, "[fdcf:8538:9ad5:1112::10]:53", 0}, &DNSServer{"Kioubit", "Kioubit", "hk1", 6, "[fdcf:8538:9ad5:1112::11]:53", 0}, // jlu5 &DNSServer{"recursive", "jlu5", "l.recursive-servers.dn42", 4, "172.22.108.22", 0}, &DNSServer{"recursive", "jlu5", "l.recursive-servers.dn42", 6, "[fd86:bad:11b7:53::2]:53", 0}, &DNSServer{"jlu5", "jlu5", "us-chi01", 6, "[fd86:bad:11b7::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "us-sea02", 6, "[fd86:bad:11b7:1::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "us-atl01", 6, "[fd86:bad:11b7:9::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "us-lax01", 6, "[fd86:bad:11b7:22::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "us-nyc02", 6, "[fd86:bad:11b7:23::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "uk-lon01", 6, "[fd86:bad:11b7:116::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "de-nbg01", 6, "[fd86:bad:11b7:117::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "nl-ams02", 6, "[fd86:bad:11b7:118::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "jp-tyo01", 6, "[fd86:bad:11b7:224::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "au-syd01", 6, "[fd86:bad:11b7:225::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "es-mad01", 6, "[fd86:bad:11b7:120::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "sg01", 6, "[fd86:bad:11b7:227::1]:53", 0}, &DNSServer{"jlu5", "jlu5", "br-sao01", 6, "[fd86:bad:11b7:333::1]:53", 0}, // lantian &DNSServer{"recursive", "lantian", "t.recursive-servers.dn42", 4, "172.22.76.110", 0}, &DNSServer{"recursive", "lantian", "t.recursive-servers.dn42", 6, "[fdbc:f9dc:67ad:2547::53]:53", 0}, &DNSServer{"lantian", "lantian", "gigsgigscloud", 6, "[fdbc:f9dc:67ad:1::53]:53", 0}, &DNSServer{"lantian", "lantian", "virtono", 6, "[fdbc:f9dc:67ad:2::53]:53", 0}, &DNSServer{"lantian", "lantian", "hostdare", 6, "[fdbc:f9dc:67ad:3::53]:53", 0}, &DNSServer{"lantian", "lantian", "virmach-ny1g", 6, "[fdbc:f9dc:67ad:8::53]:53", 0}, } ////////////////////////////////////////////////////////////////////////// // // DNS Server Metrics // ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// // initialisation function to register metrics func (m *DNSMetrics) Register() { m.soa = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "dn42_dns_soa", Help: "SOA for .dn42 domain", }, []string{"role", "owner", "name", "ip", "addr"}) prometheus.MustRegister(m.soa) m.rtt = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "dn42_dns_rtt", Help: "RTT when collecting SOA for .dn42 domain", }, []string{"role", "owner", "name", "ip", "addr"}) prometheus.MustRegister(m.rtt) m.valid = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "dn42_dns_valid", Help: "0 = response and latest serial, 1 = response and matching serial, " + "2 = response but incorrect serial, 3 = server error", }, []string{"role", "owner", "name", "ip", "addr"}) prometheus.MustRegister(m.valid) m.stime = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "dn42_dns_stime", Help: "Returns the time between now and the SOA serial number", }, []string{"role", "owner", "name", "ip", "addr"}) prometheus.MustRegister(m.stime) // pre-populate the labels m.label_map = make([]prometheus.Labels, len(dns_servers)) for ix, server := range dns_servers { m.label_map[ix] = prometheus.Labels{ "role": server.role, "owner": server.owner, "name": server.name, "ip": strconv.Itoa(int(server.ip)), "addr": server.addr, } } } ////////////////////////////////////////////////////////////////////////// // collect metrics for all DNS servers func (m *DNSMetrics) Collect() { // hold the collect results in an array where each // entry corresponds to the dns_servers array results := make([]*DNSResult, len(dns_servers)) // query each server up to 3 times to try and get a result for count := 0; count < 3; count++ { // run each tranche of queries in parallel var wg sync.WaitGroup for ix := 0; ix < len(results); ix++ { if results[ix] == nil { // no result yet, query the server wg.Add(1) go func(i int, s *sync.WaitGroup) { results[i] = dns_servers[i].Query() s.Done() }(ix, &wg) } } // all done wg.Wait() } now := uint64(time.Now().Unix()) // create a list of master servers SOA and // record which one has the latest version var latest_soa uint32 = 0 masters := make([]uint32, 2) for ix, server := range dns_servers { if server.role == "master" { // check that a result was actually received by the master if results[ix] != nil { soa := results[ix].serial masters = append(masters, soa) if soa > latest_soa { latest_soa = soa } } } } // fmt.Printf("latest_soa %d\n", latest_soa) // now go through each result and update the metrics // // valid = 0 - Serial number matches latest master // valid = 1 - Serial number matches one master (but not the latest) // valid = 2 - Serial number doesn't match anything // valid = 3 - An error occured // for ix, r := range results { // assume that an error occured unless told otherwise var valid uint = 3 // check if there was a valid result if r != nil { // update the server SOA from the result dns_servers[ix].soa = r.serial // SOA and RTT are direct metrics returned from the query m.soa.With(m.label_map[ix]).Set(float64(r.serial)) m.rtt.With(m.label_map[ix]).Set(r.rtt) // if the server has a high rtt ( > 500ms), then log an info message if r.rtt > 500 { log.WithFields(log.Fields{ "result": r, "server": dns_servers[ix].name, "ipv": dns_servers[ix].ip, }).Info("DNS Server high RTT") } // check if the SOA is valid // it's an error if the SOA is more than 25 hours old if (now - uint64(r.serial)) < (3600 * 25) { // fmt.Printf("Checking serial: %s = %d\n", dns_servers[ix].name, r.serial) if r.serial == latest_soa { // the SOA matches the current latest SOA valid = 0 } else { // assume no match found valid = 2 // step through each master to see if the SOA matches somewhere for _, soa := range masters { if r.serial == soa { // found one valid = 1 break } } } } // before setting whether the server is valid, calculate the stime // (difference in time between now and the SOA, to allow checking that // it is not stale) // it's possible that the SOA could be in the future if there is a // clock mismatch between monitor and DNS server, in which case this // is flagged as a server error if uint64(r.serial) > now { // server error valid = 3 } else { m.stime.With(m.label_map[ix]).Set(float64(now - uint64(r.serial))) } } // finally set the valid status m.valid.With(m.label_map[ix]).Set(float64(valid)) } } ////////////////////////////////////////////////////////////////////////// // construct the DNS query and send to a server func (s *DNSServer) Query() *DNSResult { // create a new recursive query msg := new(dns.Msg) msg.Id = dns.Id() msg.RecursionDesired = (s.role == "recursive") // query the dn42 root zone SOA msg.Question = []dns.Question{ dns.Question{ Name: "dn42.", Qtype: dns.TypeSOA, Qclass: dns.ClassINET, }, } // add EDNS0 options to also query the service ID (NSID) // pretty much copied verbatim from the library docs // opts := new(dns.OPT) // opts.Hdr.Name = "." // opts.Hdr.Rrtype = dns.TypeOPT // create the NSID option // ns_opt := new(dns.EDNS0_NSID) // ns_opt.Code = dns.EDNS0NSID // ns_opt.Nsid = "" // add the NSID option to the opts RR // opts.Option = append(opts.Option, ns_opt) // then add the opts RR to the query // msg.SetEdns0(4096, false) // msg.Extra = []dns.RR{opts} // create a new DNS client client := new(dns.Client) client.Timeout, _ = time.ParseDuration("4s") // fmt.Printf("Querying: %s\n", s.name) // and finally query the server resp, rtt, err := client.Exchange(msg, s.addr) if err != nil || len(resp.Answer) != 1 { log.WithFields(log.Fields{ "error": err, "resp": resp, "server": s, }).Warn("Failed to query DNS server") return nil } // fmt.Printf("Resp: %v\n", resp) // was an SOA returned ? if soa, ok := resp.Answer[0].(*dns.SOA); !ok { log.WithFields(log.Fields{ "resp": resp, "server": s, }).Warn("DNS response was not an SOA") return nil } else { // got an SOA result result := &DNSResult{ serial: soa.Serial, rtt: math.Round(rtt.Seconds() * 1000), } // did we also get an NSID result ? if opts := resp.IsEdns0(); opts != nil { // response contains an EDNS0 record // check for an NSID entry for _, s := range opts.Option { switch e := s.(type) { case *dns.EDNS0_NSID: result.nsid = e.Nsid } } } return result } } ////////////////////////////////////////////////////////////////////////// // // DNS Commit Metric // // Check the master commit against the current registry commit // (via the explorer) to check that the master is being updated // ////////////////////////////////////////////////////////////////////////// func (m *DNSCommitMetrics) Register() { m.match = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "dn42_dns_commit", Help: "Time since commits last matched (or -1 for error)", }, []string{"metric"}) prometheus.MustRegister(m.match) m.labelInvalid = prometheus.Labels{ "metric": "invalid", } m.labelUpdate = prometheus.Labels{ "metric": "update", } } func (m *DNSCommitMetrics) Collect() { now := time.Now() interval := now.Sub(m.lastUpdate) // only check if it's more than 60 mins since the last successful match if (interval.Hours() >= 1.0) || m.mismatch { // fetch the current commit from the explorer ec, err := m.FetchExplorerCommit() if err != nil { // couldn't fetch the explorer commit value, set the metric to bad m.match.With(m.labelUpdate).Set(-1.0) m.match.With(m.labelInvalid).Set(-1.0) m.mismatch = true return } mc, err := m.FetchMasterCommit() if err != nil { // couldn't fetch master commit value, set the metric to bad m.match.With(m.labelUpdate).Set(-2.0) m.match.With(m.labelInvalid).Set(-1.0) m.mismatch = true return } m.lastUpdate = now m.match.With(m.labelUpdate).Set(0.0) if ec == mc { // if the commits match, null the metric m.match.With(m.labelInvalid).Set(0.0) m.mismatch = false } else { // was this the first mismatch ? if !m.mismatch { m.since = now m.mismatch = true } since := now.Sub(m.since).Seconds() log.WithFields(log.Fields{ "master": mc, "explorer": ec, "since": since, }).Warn("DNS Commit Mismatch") // set the metric to be the interval since last good match m.match.With(m.labelInvalid).Set(float64(since)) } } else { // update time since last check m.match.With(m.labelUpdate).Set(float64(interval.Seconds())) } } ////////////////////////////////////////////////////////////////////////// // fetch the current commit from the explorer func (m *DNSCommitMetrics) FetchExplorerCommit() (string, error) { response, err := http.Get("http://dn42regsrv.burble.dn42:8042/api/registry/.meta") if err != nil { log.WithFields(log.Fields{ "error": err, }).Warn("Unable to query registry explorer") return "", err } data, err := ioutil.ReadAll(response.Body) if err != nil { log.WithFields(log.Fields{ "error": err, }).Warn("Unable read explorer response") return "", err } var ec DNSExplorerCommit if err := json.Unmarshal(data, &ec); err != nil { log.WithFields(log.Fields{ "error": err, }).Warn("Unable to parse explorer JSON") return "", err } return strings.ToLower(ec.Commit), nil } ////////////////////////////////////////////////////////////////////////// // fetch the current commit from the master func (m *DNSCommitMetrics) FetchMasterCommit() (string, error) { // JRB0001's master // response, err := // http.Get("http://[fd42:180:3de0:10:5054:ff:fe87:ea39]:8080/api/git-db-state") // burble.dn42 master response, err := http.Get("http://[fd42:180:3de0:30::1]:8080/api/git-db-state") if err != nil { log.WithFields(log.Fields{ "error": err, }).Warn("Unable to query registry master") return "", err } data, err := ioutil.ReadAll(response.Body) if err != nil { log.WithFields(log.Fields{ "error": err, }).Warn("Unable read master response") return "", err } var mc []DNSMasterCommit if err := json.Unmarshal(data, &mc); err != nil { log.WithFields(log.Fields{ "error": err, }).Warn("Unable to parse master JSON") return "", err } for _, c := range mc { if strings.HasSuffix(c.URI, "registry.git") { return strings.ToLower(c.Commit), nil } } log.WithFields(log.Fields{ "MasterCommit": mc, }).Warn("Unable to find registry commit from master") return "", errors.New("Unable to find registry commit from master") } ////////////////////////////////////////////////////////////////////////// // end of code