Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 119 additions & 53 deletions cmd/cert-checker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@ import (
"bytes"
"context"
"crypto/x509"
"encoding/json"
"flag"
"fmt"
"net"
"net/netip"
"os"
"regexp"
"slices"
"strings"
"sync"
"sync/atomic"
"time"
Expand Down Expand Up @@ -38,6 +39,36 @@ import (
"github.com/letsencrypt/boulder/sa"
)

// certCheckerMetrics bundles the Prometheus collectors that cert-checker
// populates during a run.
//
// NewCertCheckerMetrics fills this struct with a positional (unkeyed)
// composite literal, so the order of the fields below must stay in sync with
// that literal.
type certCheckerMetrics struct {
// checkerLatency is a histogram of how long, in seconds, each worker took to
// process its share of the certificates.
checkerLatency prometheus.Histogram
// checkerTimestamp is set to the current time once processing has finished.
checkerTimestamp prometheus.Gauge
// checkerGoodCount is set to the report's GoodCerts total.
checkerGoodCount prometheus.Gauge
// checkerBadCount is set to the report's BadCerts total.
checkerBadCount prometheus.Gauge
}

// NewCertCheckerMetrics builds cert-checker's metrics on top of the stats
// registerer: a batch-latency histogram plus gauges for the last-run
// timestamp, the number of good certificates, and the number of bad
// certificates. The collectors are created in that order and returned bundled
// in a certCheckerMetrics.
func NewCertCheckerMetrics(stats prometheus.Registerer) *certCheckerMetrics {
	// Every collector is created through the same registerer-bound factory.
	factory := promauto.With(stats)

	return &certCheckerMetrics{
		checkerLatency: factory.NewHistogram(prometheus.HistogramOpts{
			Name: "cert_checker_latency",
			Help: "Histogram of latencies a cert-checker worker takes to complete a batch",
		}),
		checkerTimestamp: factory.NewGauge(prometheus.GaugeOpts{
			Name: "cert_checker_last_run_timestamp",
			Help: "Timestamp of cert-checker's last run",
		}),
		checkerGoodCount: factory.NewGauge(prometheus.GaugeOpts{
			Name: "cert_checker_good_count",
			Help: "Cert-checker count of good certificates",
		}),
		checkerBadCount: factory.NewGauge(prometheus.GaugeOpts{
			Name: "cert_checker_bad_count",
			Help: "Cert-checker count of bad certificates",
		}),
	}
}

// For defense-in-depth, in addition to using the PA & its identPolicy to check
// domain names, we also perform a check against the regexes from the
// forbiddenDomains array.
Expand All @@ -62,25 +93,9 @@ var batchSize = 1000
type report struct {
begin time.Time
end time.Time
GoodCerts int64 `json:"good-certs"`
BadCerts int64 `json:"bad-certs"`
DbErrs int64 `json:"db-errs"`
Entries map[string]reportEntry `json:"entries"`
}

func (r *report) dump() error {
content, err := json.MarshalIndent(r, "", " ")
if err != nil {
return err
}
fmt.Fprintln(os.Stdout, string(content))
return nil
}

type reportEntry struct {
Valid bool `json:"valid"`
SANs []string `json:"sans"`
Problems []string `json:"problems,omitempty"`
GoodCerts int64 `json:"good-certs"`
BadCerts int64 `json:"bad-certs"`
DbErrs int64 `json:"db-errs"`
}

// certDB is an interface collecting the borp.DbMap functions that the various
Expand Down Expand Up @@ -134,7 +149,6 @@ func newChecker(saDbMap certDB,
certs: make(chan *corepb.Certificate, batchSize),
rMu: new(sync.Mutex),
clock: clk,
issuedReport: report{Entries: make(map[string]reportEntry)},
checkPeriod: period,
acceptableValidityDurations: avd,
lints: lints,
Expand Down Expand Up @@ -265,26 +279,17 @@ func (c *certChecker) getCerts(ctx context.Context) error {
return nil
}

func (c *certChecker) processCerts(ctx context.Context, wg *sync.WaitGroup, badResultsOnly bool) {
func (c *certChecker) processCerts(ctx context.Context) {
for cert := range c.certs {
sans, problems := c.checkCert(ctx, cert)
valid := len(problems) == 0
c.rMu.Lock()
if !badResultsOnly || (badResultsOnly && !valid) {
c.issuedReport.Entries[cert.Serial] = reportEntry{
Valid: valid,
SANs: sans,
Problems: problems,
}
}
c.rMu.Unlock()
if !valid {
atomic.AddInt64(&c.issuedReport.BadCerts, 1)
c.logger.AuditErr("certificate error found", nil, map[string]any{"serial": cert.Serial, "sans": sans, "problems": problems})
Comment thread
aarongable marked this conversation as resolved.
} else {
atomic.AddInt64(&c.issuedReport.GoodCerts, 1)
}
}
wg.Done()
}

// Extensions that we allow in certificates
Expand Down Expand Up @@ -540,8 +545,19 @@ type Config struct {
cmd.HostnamePolicyConfig

Workers int `validate:"required,min=1"`
// Deprecated: this is ignored, and cert checker always checks both expired and unexpired.
UnexpiredOnly bool
// LookupDNSAuthority can only be specified with PushgatewayService. It's a single
// <hostname|IPv4|[IPv6]>:<port> of the DNS server to be used for resolution
// of pushgateway backends. If the address contains a hostname it will be resolved
// using system DNS. If the address contains a port, the client will use it
// directly, otherwise port 53 is used.
LookupDNSAuthority string `validate:"excluded_without=PushgatewayService,required_with=PushgatewayService,omitempty,ip|hostname|hostname_port"`
// PushgatewayService entry contains a service and domain name that will be used
// to construct a SRV DNS query to lookup pushgateway backends. For example: if
// the resource record is 'foo.service.consul', then the 'Service' is 'foo'
// and the 'Domain' is 'service.consul'. The expected dNSName to be
// authenticated in the server certificate would be 'foo.service.consul'.
PushgatewayService *cmd.ServiceDomain `validate:"required_with=LookupDNSAuthority"`
// Deprecated: cert-checker only logs bad results anyway.
BadResultsOnly bool
CheckPeriod config.Duration

Expand Down Expand Up @@ -577,6 +593,47 @@ type Config struct {
Syslog cmd.SyslogConfig
}

// getPushgatewayURL resolves svc via SRV+A lookups against dnsAuthority and
// returns an http:// URL whose host is an IP address. Both lookups go through
// dnsAuthority (typically Consul DNS) because the system resolver can't answer
// queries for the .consul domain. The SRV target is then flattened to an IP
// because the returned URL is consumed by net/http via cmd.PushMetrics, which
// resolves hostnames using the system resolver. Scheme is fixed to http:
// pushgateway is assumed to be on an internal network.
//
// dnsAuthority is either "<host>:<port>" or a bare host; when no port can be
// split off, port 53 is used. Only the first SRV record returned by the
// resolver, and the first address that record's target resolves to, are used.
//
// An error is returned if either lookup fails or returns no results.
func getPushgatewayURL(ctx context.Context, dnsAuthority string, svc cmd.ServiceDomain) (string, error) {
	host, port, err := net.SplitHostPort(dnsAuthority)
	if err != nil {
		// No port could be split off, so treat the whole value as a hostname or
		// IP address and fall back to the default DNS port. A port-less
		// bracketed IPv6 literal ("[::1]") must lose its brackets here:
		// net.JoinHostPort brackets any host containing a colon, and would
		// otherwise produce the undialable "[[::1]]:53".
		host = strings.TrimSuffix(strings.TrimPrefix(dnsAuthority, "["), "]")
		port = "53"
	}
	// The authority address never changes, so build it once instead of on
	// every dial.
	authorityAddr := net.JoinHostPort(host, port)

	r := &net.Resolver{
		PreferGo: true,
		// Ignore the address the resolver would have dialed and always talk to
		// the configured DNS authority instead.
		Dial: func(ctx context.Context, network, _ string) (net.Conn, error) {
			return (&net.Dialer{}).DialContext(ctx, network, authorityAddr)
		},
	}

	_, targets, err := r.LookupSRV(ctx, svc.Service, "tcp", svc.Domain)
	if err != nil {
		return "", fmt.Errorf("SRV lookup of _%s._tcp.%s failed: %w", svc.Service, svc.Domain, err)
	}
	if len(targets) == 0 {
		return "", fmt.Errorf("SRV lookup of _%s._tcp.%s returned 0 results", svc.Service, svc.Domain)
	}

	// Flatten the SRV target to an IP using the same Consul authority; net/http
	// (used downstream) would otherwise try to resolve names like
	// *.addr.dc1.consul via the system resolver and fail.
	target := strings.TrimSuffix(targets[0].Target, ".")
	addrs, err := r.LookupHost(ctx, target)
	if err != nil {
		return "", fmt.Errorf("A/AAAA lookup of %q failed: %w", target, err)
	}
	if len(addrs) == 0 {
		return "", fmt.Errorf("A/AAAA lookup of %q returned 0 results", target)
	}

	return fmt.Sprintf("http://%s", net.JoinHostPort(addrs[0], fmt.Sprint(targets[0].Port))), nil
}

func main() {
configFile := flag.String("config", "", "File path to the configuration file for this service")
flag.Parse()
Expand All @@ -594,6 +651,9 @@ func main() {
logger := cmd.NewLogger(config.Syslog)
cmd.LogStartup(logger)

reg := prometheus.NewRegistry()
metrics := NewCertCheckerMetrics(reg)

acceptableValidityDurations := make(map[time.Duration]bool)
if len(config.CertChecker.AcceptableValidityDurations) > 0 {
for _, entry := range config.CertChecker.AcceptableValidityDurations {
Expand All @@ -616,11 +676,6 @@ func main() {
saDbMap, err := sa.InitWrappedDb(config.CertChecker.DB, prometheus.DefaultRegisterer, logger)
cmd.FailOnError(err, "While initializing dbMap")

checkerLatency := promauto.NewHistogram(prometheus.HistogramOpts{
Name: "cert_checker_latency",
Help: "Histogram of latencies a cert-checker worker takes to complete a batch",
})

pa, err := policy.New(config.PA.Identifiers, config.PA.Challenges, logger)
cmd.FailOnError(err, "Failed to create PA")

Expand Down Expand Up @@ -663,23 +718,34 @@ func main() {
fmt.Fprintf(os.Stderr, "# Processing certificates using %d workers\n", config.CertChecker.Workers)
wg := new(sync.WaitGroup)
for range config.CertChecker.Workers {
wg.Add(1)
go func() {
wg.Go(func() {
s := checker.clock.Now()
checker.processCerts(context.TODO(), wg, config.CertChecker.BadResultsOnly)
checkerLatency.Observe(checker.clock.Since(s).Seconds())
}()
checker.processCerts(context.TODO())
metrics.checkerLatency.Observe(checker.clock.Since(s).Seconds())
})
}
wg.Wait()
fmt.Fprintf(
os.Stderr,
"# Finished processing certificates, report length: %d, good: %d, bad: %d\n",
len(checker.issuedReport.Entries),
checker.issuedReport.GoodCerts,
checker.issuedReport.BadCerts,
)
err = checker.issuedReport.dump()
cmd.FailOnError(err, "Failed to dump results: %s\n")
logger.AuditInfo("Finished processing certificates", checker.issuedReport)

metrics.checkerTimestamp.SetToCurrentTime()
metrics.checkerGoodCount.Set(float64(checker.issuedReport.GoodCerts))
metrics.checkerBadCount.Set(float64(checker.issuedReport.BadCerts))

if config.CertChecker.PushgatewayService != nil {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
pushgatewayURL, err := getPushgatewayURL(ctx, config.CertChecker.LookupDNSAuthority, *config.CertChecker.PushgatewayService)
if err != nil {
logger.Errf("failed to get pushgateway URL: %s", err)
} else {
err = cmd.PushMetrics("cert-checker", pushgatewayURL, reg, logger)
if err != nil {
logger.Errf("failed to push metrics to pushgateway: %s", err)
} else {
logger.Debugf("pushed metrics to pushgateway at %s", pushgatewayURL)
}
}
}

if checker.issuedReport.BadCerts > 0 {
os.Exit(1)
Expand Down
73 changes: 43 additions & 30 deletions cmd/cert-checker/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,19 @@ import (
"log"
"math/big"
mrand "math/rand/v2"
"net"
"net/url"
"os"
"slices"
"strconv"
"strings"
"sync"
"testing"
"time"

"github.com/jmhodges/clock"
"google.golang.org/protobuf/types/known/timestamppb"

"github.com/letsencrypt/boulder/cmd"
"github.com/letsencrypt/boulder/core"
corepb "github.com/letsencrypt/boulder/core/proto"
"github.com/letsencrypt/boulder/ctpolicy/loglist"
Expand Down Expand Up @@ -336,7 +339,8 @@ func TestGetAndProcessCerts(t *testing.T) {
fc := clock.NewFake()
fc.Set(fc.Now().Add(time.Hour))

checker := newChecker(saDbMap, fc, pa, kp, time.Hour, testValidityDurations, nil, blog.NewMock())
mocklog := blog.NewMock()
checker := newChecker(saDbMap, fc, pa, kp, time.Hour, testValidityDurations, nil, mocklog)
sa, err := sa.NewSQLStorageAuthority(saDbMap, saDbMap, nil, 0, fc, blog.NewMock(), metrics.NoopRegisterer)
test.AssertNotError(t, err, "Couldn't create SA to insert certificates")
saCleanUp := test.ResetBoulderTestDatabase(t)
Expand Down Expand Up @@ -375,11 +379,9 @@ func TestGetAndProcessCerts(t *testing.T) {
err = checker.getCerts(context.Background())
test.AssertNotError(t, err, "Failed to retrieve certificates")
test.AssertEquals(t, len(checker.certs), 5)
wg := new(sync.WaitGroup)
wg.Add(1)
checker.processCerts(context.Background(), wg, false)
checker.processCerts(context.Background())
test.AssertEquals(t, checker.issuedReport.BadCerts, int64(5))
test.AssertEquals(t, len(checker.issuedReport.Entries), 5)
test.AssertEquals(t, len(mocklog.GetAllMatching("certificate error found")), 5)
}

// mismatchedCountDB is a certDB implementation for `getCerts` that returns one
Expand Down Expand Up @@ -507,30 +509,6 @@ func TestGetCertsLate(t *testing.T) {
}
}

func TestSaveReport(t *testing.T) {
r := report{
begin: time.Time{},
end: time.Time{},
GoodCerts: 2,
BadCerts: 1,
Entries: map[string]reportEntry{
"020000000000004b475da49b91da5c17": {
Valid: true,
},
"020000000000004d1613e581432cba7e": {
Valid: true,
},
"020000000000004e402bc21035c6634a": {
Valid: false,
Problems: []string{"None really..."},
},
},
}

err := r.dump()
test.AssertNotError(t, err, "Failed to dump results")
}

func TestIsForbiddenDomain(t *testing.T) {
// Note: These testcases are not an exhaustive representation of domains
// Boulder won't issue for, but are instead testing the defense-in-depth
Expand Down Expand Up @@ -698,3 +676,38 @@ func TestPrecertCorrespond(t *testing.T) {
}
t.Fatalf("expected precert correspondence problem, but got: %v", problems)
}

func TestGetPushgatewayURL(t *testing.T) {
ctx := context.Background()
t.Run("happy path", func(t *testing.T) {
gotURL, err := getPushgatewayURL(ctx, "consul.service.consul:53",
cmd.ServiceDomain{Service: "redisratelimits", Domain: "service.consul"})
test.AssertNotError(t, err, "")

parsed, err := url.Parse(gotURL)
test.AssertNotError(t, err, "returned URL should be parseable")
test.AssertEquals(t, parsed.Scheme, "http")

host, port, err := net.SplitHostPort(parsed.Host)
test.AssertNotError(t, err, "URL host should contain a port")
test.AssertNotNil(t, net.ParseIP(host), "host should be an IP (LookupHost flatten step)")
portNum, err := strconv.Atoi(port)
test.AssertNotError(t, err, "port should be numeric")
test.Assert(t, portNum > 0 && portNum < 65536, "port should be in valid range")
})
t.Run("DNS authority no port specified", func(t *testing.T) {
_, err := getPushgatewayURL(ctx, "consul.service.consul",
cmd.ServiceDomain{Service: "redisratelimits", Domain: "service.consul"})
test.AssertNotError(t, err, "")
})
t.Run("SRV not found", func(t *testing.T) {
_, err := getPushgatewayURL(ctx, "consul.service.consul:53",
cmd.ServiceDomain{Service: "doesnotexist", Domain: "service.consul"})
test.AssertError(t, err, "")
})
t.Run("DNS authority unreachable", func(t *testing.T) {
_, err := getPushgatewayURL(ctx, "doesnotexist.invalid:53",
cmd.ServiceDomain{Service: "redisratelimits", Domain: "service.consul"})
test.AssertError(t, err, "")
})
}
13 changes: 13 additions & 0 deletions cmd/shell.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/prometheus/client_golang/prometheus/collectors/version"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/prometheus/client_golang/prometheus/push"
"github.com/redis/go-redis/v9"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
Expand Down Expand Up @@ -575,3 +576,15 @@ func WaitForSignal() {
signal.Notify(sigChan, syscall.SIGHUP)
<-sigChan
}

func PushMetrics(jobname, pushgatewayURL string, gatherer prometheus.Gatherer, logger blog.Logger) error {
hostname, err := os.Hostname()
if err != nil {
logger.Warningf("error getting hostname: %s", err)
hostname = "unknown"
}
return push.New(pushgatewayURL, jobname).
Gatherer(gatherer).
Grouping("instance", hostname).
Comment thread
aarongable marked this conversation as resolved.
Push()
Comment thread
lenaunderwood22 marked this conversation as resolved.
}
Loading