Refactor SOA checking, parallel DNS queries

This commit is contained in:
Simon Marsh 2019-06-29 10:52:40 +01:00
parent cb7a98b42b
commit 7b7d9965b9
Signed by: burble
GPG Key ID: 7B9FE8780CFB6593
2 changed files with 76 additions and 54 deletions

View File

@ -1,29 +0,0 @@
##########################################################################
# dn42promsrv example systemd service file
##########################################################################
# unit description and ordering: started after network.target
[Unit]
Description=DN42 Prometheus Stats Server
After=network.target
# install section: the unit is wanted by multi-user.target
[Install]
WantedBy=multi-user.target
# service definition: runs as user and group promsrv, as a simple
# service that is restarted on failure
[Service]
User=promsrv
Group=promsrv
Type=simple
Restart=on-failure
# service hardening
ProtectSystem=strict
NoNewPrivileges=yes
ProtectControlGroups=yes
PrivateTmp=yes
PrivateDevices=yes
DevicePolicy=closed
MemoryDenyWriteExecute=yes
# command executed to start the service
ExecStart=/usr/local/bin/dn42promsrv
#########################################################################
# end of file

101
dns.go
View File

@ -18,6 +18,7 @@ import (
"net/http"
"strconv"
"strings"
"sync"
"time"
)
@ -174,7 +175,8 @@ func (m *DNSMetrics) Register() {
m.valid = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "dn42_dns_valid",
Help: "0 = response and correct serial, 1 = response but incorrect serial, 2 = no response",
Help: "0 = response and latest serial, 1 = response and matching serial, " +
"2 = response but incorrect serial, 3 = server error",
}, []string{"role", "owner", "name", "ip", "addr"})
prometheus.MustRegister(m.valid)
@ -204,44 +206,70 @@ func (m *DNSMetrics) Register() {
func (m *DNSMetrics) Collect() {
now := uint64(time.Now().Unix())
// search the masters for the highest SOA
var latest_soa uint32 = 0
for _, server := range dns_servers {
if server.role == "master" {
if server.soa > latest_soa {
latest_soa = server.soa
}
}
}
// hold the results in an array where each entry corresponds
// to the dns_servers array
// hold the collect results in an array where each
// entry corresponds to the dns_servers array
results := make([]*DNSResult, len(dns_servers))
// query each server up to 3 times to try and get a result
for count := 0; count < 3; count++ {
// run each tranche of queries in parallel
var wg sync.WaitGroup
for ix := 0; ix < len(results); ix++ {
if results[ix] == nil {
// no result yet, query the server
results[ix] = dns_servers[ix].Query()
wg.Add(1)
go func(i int, s *sync.WaitGroup) {
results[i] = dns_servers[i].Query()
s.Done()
}(ix, &wg)
}
}
// all done
wg.Wait()
}
now := uint64(time.Now().Unix())
// create a list of master servers SOA and
// record which one has the latest version
var latest_soa uint32 = 0
masters := make([]uint32, 2)
for ix, server := range dns_servers {
if server.role == "master" {
soa := results[ix].serial
masters = append(masters, soa)
if soa > latest_soa {
latest_soa = soa
}
}
}
// fmt.Printf("latest_soa %d\n", latest_soa)
// now go through each result and update the metrics
//
// valid = 0 - Serial number matches latest master
// valid = 1 - Serial number matches one master (but not the latest)
// valid = 2 - Serial number doesn't match anything
// valid = 3 - An error occurred
//
for ix, r := range results {
var valid uint = 2
// assume that an error occurred unless told otherwise
var valid uint = 3
// check if there was a valid result
if r != nil {
// update the server SOA
// update the server SOA from the result
dns_servers[ix].soa = r.serial
// SOA and RTT are direct metrics returned from the query
@ -257,11 +285,32 @@ func (m *DNSMetrics) Collect() {
}).Info("DNS Server high RTT")
}
// check if the SOA matches the latest master SOA
if r.serial == latest_soa {
valid = 0
} else {
valid = 1
// check if the SOA is valid
// it's an error if the SOA is more than 25 hours old
if (now - uint64(r.serial)) < (3600 * 25) {
// fmt.Printf("Checking serial: %s = %d\n", dns_servers[ix].name, r.serial)
if r.serial == latest_soa {
// the SOA matches the current latest SOA
valid = 0
} else {
// assume no match found
valid = 2
// step through each master to see if the SOA matches somewhere
for _, soa := range masters {
if r.serial == soa {
// found one
valid = 1
break
}
}
}
}
// before setting whether the server is valid, calculate the stime
@ -273,7 +322,7 @@ func (m *DNSMetrics) Collect() {
if uint64(r.serial) > now {
// server error
valid = 2
valid = 3
} else {
m.stime.With(m.label_map[ix]).Set(float64(now - uint64(r.serial)))
}
@ -324,6 +373,8 @@ func (s *DNSServer) Query() *DNSResult {
client := new(dns.Client)
client.Timeout, _ = time.ParseDuration("4s")
// fmt.Printf("Querying: %s\n", s.name)
// and finally query the server
resp, rtt, err := client.Exchange(msg, s.addr)
if err != nil || len(resp.Answer) != 1 {