From fae0b037df813169ba818c937212abe9c961cf9c Mon Sep 17 00:00:00 2001 From: Lena Date: Tue, 26 May 2026 14:20:29 -0700 Subject: [PATCH 01/20] cert-checker: fix logging & push metrics --- cmd/cert-checker/main.go | 94 +++-- cmd/cert-checker/main_test.go | 35 +- cmd/shell.go | 13 + .../client_golang/prometheus/push/push.go | 356 ++++++++++++++++++ vendor/modules.txt | 1 + 5 files changed, 434 insertions(+), 65 deletions(-) create mode 100644 vendor/github.com/prometheus/client_golang/prometheus/push/push.go diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index d1eeb08ef27..f0b33892112 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -4,7 +4,6 @@ import ( "bytes" "context" "crypto/x509" - "encoding/json" "flag" "fmt" "net/netip" @@ -38,6 +37,36 @@ import ( "github.com/letsencrypt/boulder/sa" ) +type certCheckerMetrics struct { + checkerLatency prometheus.Histogram + checkerTimestamp prometheus.Gauge + checkerGoodCount prometheus.Gauge + checkerBadCount prometheus.Gauge +} + +func NewCertCheckerMetrics(stats prometheus.Registerer) *certCheckerMetrics { + checkerLatency := promauto.With(stats).NewHistogram(prometheus.HistogramOpts{ + Name: "cert_checker_latency", + Help: "Histogram of latencies a cert-checker worker takes to complete a batch", + }) + + checkerTimestamp := promauto.With(stats).NewGauge(prometheus.GaugeOpts{ + Name: "cert_checker_last_run_timestamp", + Help: "Timestamp of cert-checker's last run", + }) + + checkerGoodCount := promauto.With(stats).NewGauge(prometheus.GaugeOpts{ + Name: "cert_checker_good_count", + Help: "Cert-checker count of good certificates", + }) + + checkerBadCount := promauto.With(stats).NewGauge(prometheus.GaugeOpts{ + Name: "cert_checker_bad_count", + Help: "Cert-checker count of bad certificates", + }) + return &certCheckerMetrics{checkerLatency, checkerTimestamp, checkerGoodCount, checkerBadCount} +} + // For defense-in-depth in addition to using the PA & its identPolicy to check // domain names we also perform a check against the regex's from the // forbiddenDomains array @@ -62,19 +91,10 @@ var batchSize = 1000 type report struct { begin time.Time end time.Time - GoodCerts int64 `json:"good-certs"` - BadCerts int64 `json:"bad-certs"` - DbErrs int64 `json:"db-errs"` - Entries map[string]reportEntry `json:"entries"` -} - -func (r *report) dump() error { - content, err := json.MarshalIndent(r, "", " ") - if err != nil { - return err - } - fmt.Fprintln(os.Stdout, string(content)) - return nil + GoodCerts int64 `json:"good-certs"` + BadCerts int64 `json:"bad-certs"` + DbErrs int64 `json:"db-errs"` + entries map[string]reportEntry } type reportEntry struct { @@ -134,7 +154,7 @@ func newChecker(saDbMap certDB, certs: make(chan *corepb.Certificate, batchSize), rMu: new(sync.Mutex), clock: clk, - issuedReport: report{Entries: make(map[string]reportEntry)}, + issuedReport: report{entries: make(map[string]reportEntry)}, checkPeriod: period, acceptableValidityDurations: avd, lints: lints, @@ -265,13 +285,13 @@ func (c *certChecker) getCerts(ctx context.Context) error { return nil } -func (c *certChecker) processCerts(ctx context.Context, wg *sync.WaitGroup, badResultsOnly bool) { +func (c *certChecker) processCerts(ctx context.Context, badResultsOnly bool) { for cert := range c.certs { sans, problems := c.checkCert(ctx, cert) valid := len(problems) == 0 c.rMu.Lock() if !badResultsOnly || (badResultsOnly && !valid) { - c.issuedReport.Entries[cert.Serial] = reportEntry{ + c.issuedReport.entries[cert.Serial] = reportEntry{ Valid: valid, SANs: sans, Problems: problems, @@ -280,11 +300,11 @@ func (c *certChecker) processCerts(ctx context.Context, wg *sync.WaitGroup, badR c.rMu.Unlock() if !valid { atomic.AddInt64(&c.issuedReport.BadCerts, 1) + c.logger.AuditErr("certificate error found", nil, map[string]any{"serial": cert.Serial, "sans": sans, "problems": problems}) } else { atomic.AddInt64(&c.issuedReport.GoodCerts, 1) } } - wg.Done() } // Extensions that we allow in certificates @@ -536,7 +556,8 @@ func (c *certChecker) checkCert(ctx context.Context, cert *corepb.Certificate) ( type Config struct { CertChecker struct { - DB cmd.DBConfig + DB cmd.DBConfig + PushgatewayURL string `validate:"omitempty,url"` cmd.HostnamePolicyConfig Workers int `validate:"required,min=1"` @@ -594,6 +615,9 @@ func main() { logger := cmd.NewLogger(config.Syslog) cmd.LogStartup(logger) + reg := prometheus.NewRegistry() + metrics := NewCertCheckerMetrics(reg) + acceptableValidityDurations := make(map[time.Duration]bool) if len(config.CertChecker.AcceptableValidityDurations) > 0 { for _, entry := range config.CertChecker.AcceptableValidityDurations { @@ -616,11 +640,6 @@ func main() { saDbMap, err := sa.InitWrappedDb(config.CertChecker.DB, prometheus.DefaultRegisterer, logger) cmd.FailOnError(err, "While initializing dbMap") - checkerLatency := promauto.NewHistogram(prometheus.HistogramOpts{ - Name: "cert_checker_latency", - Help: "Histogram of latencies a cert-checker worker takes to complete a batch", - }) - pa, err := policy.New(config.PA.Identifiers, config.PA.Challenges, logger) cmd.FailOnError(err, "Failed to create PA") @@ -665,21 +684,26 @@ func main() { for range config.CertChecker.Workers { wg.Add(1) go func() { + defer wg.Done() s := checker.clock.Now() - checker.processCerts(context.TODO(), wg, config.CertChecker.BadResultsOnly) - checkerLatency.Observe(checker.clock.Since(s).Seconds()) + checker.processCerts(context.TODO(), config.CertChecker.BadResultsOnly) + metrics.checkerLatency.Observe(checker.clock.Since(s).Seconds()) }() } wg.Wait() - fmt.Fprintf( - os.Stderr, - "# Finished processing certificates, report length: %d, good: %d, bad: %d\n", - len(checker.issuedReport.Entries), - checker.issuedReport.GoodCerts, - checker.issuedReport.BadCerts, - ) - err = checker.issuedReport.dump() - cmd.FailOnError(err, "Failed to dump results: %s\n") + logger.AuditInfo("Finished processing certificates", checker.issuedReport) + + metrics.checkerTimestamp.SetToCurrentTime() + metrics.checkerGoodCount.Set(float64(checker.issuedReport.GoodCerts)) + metrics.checkerBadCount.Set(float64(checker.issuedReport.BadCerts)) + + if config.CertChecker.PushgatewayURL != "" { + if err = cmd.PushMetrics("cert-checker", config.CertChecker.PushgatewayURL, reg, logger); err != nil { + logger.Warningf("failed to push metrics to pushgateway: %s", err) + } else { + logger.Infof("pushed metrics to pushgateway at %s", config.CertChecker.PushgatewayURL) + } + } } func init() { diff --git a/cmd/cert-checker/main_test.go b/cmd/cert-checker/main_test.go index 9ffcf1beafa..77ababf415e 100644 --- a/cmd/cert-checker/main_test.go +++ b/cmd/cert-checker/main_test.go @@ -17,7 +17,6 @@ import ( "os" "slices" "strings" - "sync" "testing" "time" @@ -336,7 +335,8 @@ func TestGetAndProcessCerts(t *testing.T) { fc := clock.NewFake() fc.Set(fc.Now().Add(time.Hour)) - checker := newChecker(saDbMap, fc, pa, kp, time.Hour, testValidityDurations, nil, blog.NewMock()) + mocklog := blog.NewMock() + checker := newChecker(saDbMap, fc, pa, kp, time.Hour, testValidityDurations, nil, mocklog) sa, err := sa.NewSQLStorageAuthority(saDbMap, saDbMap, nil, 0, fc, blog.NewMock(), metrics.NoopRegisterer) test.AssertNotError(t, err, "Couldn't create SA to insert certificates") saCleanUp := test.ResetBoulderTestDatabase(t) @@ -375,11 +375,10 @@ func TestGetAndProcessCerts(t *testing.T) { err = checker.getCerts(context.Background()) test.AssertNotError(t, err, "Failed to retrieve certificates") test.AssertEquals(t, len(checker.certs), 5) - wg := new(sync.WaitGroup) - wg.Add(1) - checker.processCerts(context.Background(), wg, false) + checker.processCerts(context.Background(), false) test.AssertEquals(t, checker.issuedReport.BadCerts, int64(5)) - test.AssertEquals(t, len(checker.issuedReport.Entries), 5) + test.AssertEquals(t, len(checker.issuedReport.entries), 5) + test.AssertEquals(t, len(mocklog.GetAllMatching("certificate error found")), 5) } // mismatchedCountDB is a certDB implementation for `getCerts` that returns one @@ -507,30 +506,6 @@ func TestGetCertsLate(t *testing.T) { } } -func TestSaveReport(t *testing.T) { - r := report{ - begin: time.Time{}, - end: time.Time{}, - GoodCerts: 2, - BadCerts: 1, - Entries: map[string]reportEntry{ - "020000000000004b475da49b91da5c17": { - Valid: true, - }, - "020000000000004d1613e581432cba7e": { - Valid: true, - }, - "020000000000004e402bc21035c6634a": { - Valid: false, - Problems: []string{"None really..."}, - }, - }, - } - - err := r.dump() - test.AssertNotError(t, err, "Failed to dump results") -} - func TestIsForbiddenDomain(t *testing.T) { // Note: These testcases are not an exhaustive representation of domains // Boulder won't issue for, but are instead testing the defense-in-depth diff --git a/cmd/shell.go b/cmd/shell.go index 2b3509f5aaa..42d0f474f8d 100644 --- a/cmd/shell.go +++ b/cmd/shell.go @@ -26,6 +26,7 @@ import ( "github.com/prometheus/client_golang/prometheus/collectors" "github.com/prometheus/client_golang/prometheus/collectors/version" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/prometheus/client_golang/prometheus/push" "github.com/redis/go-redis/v9" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" @@ -575,3 +576,15 @@ func WaitForSignal() { signal.Notify(sigChan, syscall.SIGHUP) <-sigChan } + +func PushMetrics(jobname, pushgatewayURL string, gatherer prometheus.Gatherer, logger blog.Logger) error { + hostname, err := os.Hostname() + if err != nil { + logger.Warningf("error getting hostname: %s", err) + hostname = "unknown" + } + return push.New(pushgatewayURL, jobname). + Gatherer(gatherer). + Grouping("instance", hostname). + Push() +} diff --git a/vendor/github.com/prometheus/client_golang/prometheus/push/push.go b/vendor/github.com/prometheus/client_golang/prometheus/push/push.go new file mode 100644 index 00000000000..e524aa1303e --- /dev/null +++ b/vendor/github.com/prometheus/client_golang/prometheus/push/push.go @@ -0,0 +1,356 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package push provides functions to push metrics to a Pushgateway. It uses a +// builder approach. Create a Pusher with New and then add the various options +// by using its methods, finally calling Add or Push, like this: +// +// // Easy case: +// push.New("http://example.org/metrics", "my_job").Gatherer(myRegistry).Push() +// +// // Complex case: +// push.New("http://example.org/metrics", "my_job"). +// Collector(myCollector1). +// Collector(myCollector2). +// Grouping("zone", "xy"). +// Client(&myHTTPClient). +// BasicAuth("top", "secret"). +// Add() +// +// See the examples section for more detailed examples. +// +// See the documentation of the Pushgateway to understand the meaning of +// the grouping key and the differences between Push and Add: +// https://github.com/prometheus/pushgateway +package push + +import ( + "bytes" + "context" + "encoding/base64" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strings" + + "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" + + "github.com/prometheus/client_golang/prometheus" +) + +const ( + contentTypeHeader = "Content-Type" + // base64Suffix is appended to a label name in the request URL path to + // mark the following label value as base64 encoded. + base64Suffix = "@base64" +) + +var errJobEmpty = errors.New("job name is empty") + +// HTTPDoer is an interface for the one method of http.Client that is used by Pusher +type HTTPDoer interface { + Do(*http.Request) (*http.Response, error) +} + +// Pusher manages a push to the Pushgateway. Use New to create one, configure it +// with its methods, and finally use the Add or Push method to push. +type Pusher struct { + error error + + url, job string + grouping map[string]string + + gatherers prometheus.Gatherers + registerer prometheus.Registerer + + client HTTPDoer + header http.Header + useBasicAuth bool + username, password string + + expfmt expfmt.Format +} + +// New creates a new Pusher to push to the provided URL with the provided job +// name (which must not be empty). You can use just host:port or ip:port as url, +// in which case “http://” is added automatically. Alternatively, include the +// schema in the URL. However, do not include the “/metrics/jobs/…” part. +func New(url, job string) *Pusher { + var ( + reg = prometheus.NewRegistry() + err error + ) + if job == "" { + err = errJobEmpty + } + if !strings.Contains(url, "://") { + url = "http://" + url + } + url = strings.TrimSuffix(url, "/") + + return &Pusher{ + error: err, + url: url, + job: job, + grouping: map[string]string{}, + gatherers: prometheus.Gatherers{reg}, + registerer: reg, + client: &http.Client{}, + expfmt: expfmt.NewFormat(expfmt.TypeProtoDelim), + } +} + +// Push collects/gathers all metrics from all Collectors and Gatherers added to +// this Pusher. Then, it pushes them to the Pushgateway configured while +// creating this Pusher, using the configured job name and any added grouping +// labels as grouping key. All previously pushed metrics with the same job and +// other grouping labels will be replaced with the metrics pushed by this +// call. (It uses HTTP method “PUT” to push to the Pushgateway.) +// +// Push returns the first error encountered by any method call (including this +// one) in the lifetime of the Pusher. +func (p *Pusher) Push() error { + return p.push(context.Background(), http.MethodPut) +} + +// PushContext is like Push but includes a context. +// +// If the context expires before HTTP request is complete, an error is returned. +func (p *Pusher) PushContext(ctx context.Context) error { + return p.push(ctx, http.MethodPut) +} + +// Add works like push, but only previously pushed metrics with the same name +// (and the same job and other grouping labels) will be replaced. (It uses HTTP +// method “POST” to push to the Pushgateway.) +func (p *Pusher) Add() error { + return p.push(context.Background(), http.MethodPost) +} + +// AddContext is like Add but includes a context. +// +// If the context expires before HTTP request is complete, an error is returned. +func (p *Pusher) AddContext(ctx context.Context) error { + return p.push(ctx, http.MethodPost) +} + +// Gatherer adds a Gatherer to the Pusher, from which metrics will be gathered +// to push them to the Pushgateway. The gathered metrics must not contain a job +// label of their own. +// +// For convenience, this method returns a pointer to the Pusher itself. +func (p *Pusher) Gatherer(g prometheus.Gatherer) *Pusher { + p.gatherers = append(p.gatherers, g) + return p +} + +// Collector adds a Collector to the Pusher, from which metrics will be +// collected to push them to the Pushgateway. The collected metrics must not +// contain a job label of their own. +// +// For convenience, this method returns a pointer to the Pusher itself. +func (p *Pusher) Collector(c prometheus.Collector) *Pusher { + if p.error == nil { + p.error = p.registerer.Register(c) + } + return p +} + +// Error returns the error that was encountered. +func (p *Pusher) Error() error { + return p.error +} + +// Grouping adds a label pair to the grouping key of the Pusher, replacing any +// previously added label pair with the same label name. Note that setting any +// labels in the grouping key that are already contained in the metrics to push +// will lead to an error. +// +// For convenience, this method returns a pointer to the Pusher itself. +func (p *Pusher) Grouping(name, value string) *Pusher { + if p.error == nil { + if !model.LabelName(name).IsValid() { + p.error = fmt.Errorf("grouping label has invalid name: %s", name) + return p + } + p.grouping[name] = value + } + return p +} + +// Client sets a custom HTTP client for the Pusher. For convenience, this method +// returns a pointer to the Pusher itself. +// Pusher only needs one method of the custom HTTP client: Do(*http.Request). +// Thus, rather than requiring a fully fledged http.Client, +// the provided client only needs to implement the HTTPDoer interface. +// Since *http.Client naturally implements that interface, it can still be used normally. +func (p *Pusher) Client(c HTTPDoer) *Pusher { + p.client = c + return p +} + +// Header sets a custom HTTP header for the Pusher's client. For convenience, this method +// returns a pointer to the Pusher itself. +func (p *Pusher) Header(header http.Header) *Pusher { + p.header = header + return p +} + +// BasicAuth configures the Pusher to use HTTP Basic Authentication with the +// provided username and password. For convenience, this method returns a +// pointer to the Pusher itself. +func (p *Pusher) BasicAuth(username, password string) *Pusher { + p.useBasicAuth = true + p.username = username + p.password = password + return p +} + +// Format configures the Pusher to use an encoding format given by the +// provided expfmt.Format. The default format is expfmt.FmtProtoDelim and +// should be used with the standard Prometheus Pushgateway. Custom +// implementations may require different formats. For convenience, this +// method returns a pointer to the Pusher itself. +func (p *Pusher) Format(format expfmt.Format) *Pusher { + p.expfmt = format + return p +} + +// Delete sends a “DELETE” request to the Pushgateway configured while creating +// this Pusher, using the configured job name and any added grouping labels as +// grouping key. Any added Gatherers and Collectors added to this Pusher are +// ignored by this method. +// +// Delete returns the first error encountered by any method call (including this +// one) in the lifetime of the Pusher. +func (p *Pusher) Delete() error { + if p.error != nil { + return p.error + } + req, err := http.NewRequest(http.MethodDelete, p.fullURL(), nil) + if err != nil { + return err + } + if p.header != nil { + req.Header = p.header + } + if p.useBasicAuth { + req.SetBasicAuth(p.username, p.password) + } + resp, err := p.client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusAccepted { + body, _ := io.ReadAll(resp.Body) // Ignore any further error as this is for an error message only. + return fmt.Errorf("unexpected status code %d while deleting %s: %s", resp.StatusCode, p.fullURL(), body) + } + return nil +} + +func (p *Pusher) push(ctx context.Context, method string) error { + if p.error != nil { + return p.error + } + mfs, err := p.gatherers.Gather() + if err != nil { + return err + } + buf := &bytes.Buffer{} + enc := expfmt.NewEncoder(buf, p.expfmt) + // Check for pre-existing grouping labels: + for _, mf := range mfs { + for _, m := range mf.GetMetric() { + for _, l := range m.GetLabel() { + if l.GetName() == "job" { + return fmt.Errorf("pushed metric %s (%s) already contains a job label", mf.GetName(), m) + } + if _, ok := p.grouping[l.GetName()]; ok { + return fmt.Errorf( + "pushed metric %s (%s) already contains grouping label %s", + mf.GetName(), m, l.GetName(), + ) + } + } + } + if err := enc.Encode(mf); err != nil { + return fmt.Errorf( + "failed to encode metric family %s, error is %w", + mf.GetName(), err) + } + } + req, err := http.NewRequestWithContext(ctx, method, p.fullURL(), buf) + if err != nil { + return err + } + if p.header != nil { + req.Header = p.header + } + if p.useBasicAuth { + req.SetBasicAuth(p.username, p.password) + } + req.Header.Set(contentTypeHeader, string(p.expfmt)) + resp, err := p.client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + // Depending on version and configuration of the PGW, StatusOK or StatusAccepted may be returned. + if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted { + body, _ := io.ReadAll(resp.Body) // Ignore any further error as this is for an error message only. + return fmt.Errorf("unexpected status code %d while pushing to %s: %s", resp.StatusCode, p.fullURL(), body) + } + return nil +} + +// fullURL assembles the URL used to push/delete metrics and returns it as a +// string. The job name and any grouping label values containing a '/' will +// trigger a base64 encoding of the affected component and proper suffixing of +// the preceding component. Similarly, an empty grouping label value will be +// encoded as base64 just with a single `=` padding character (to avoid an empty +// path component). If the component does not contain a '/' but other special +// characters, the usual url.QueryEscape is used for compatibility with older +// versions of the Pushgateway and for better readability. +func (p *Pusher) fullURL() string { + urlComponents := []string{} + if encodedJob, base64 := encodeComponent(p.job); base64 { + urlComponents = append(urlComponents, "job"+base64Suffix, encodedJob) + } else { + urlComponents = append(urlComponents, "job", encodedJob) + } + for ln, lv := range p.grouping { + if encodedLV, base64 := encodeComponent(lv); base64 { + urlComponents = append(urlComponents, ln+base64Suffix, encodedLV) + } else { + urlComponents = append(urlComponents, ln, encodedLV) + } + } + return fmt.Sprintf("%s/metrics/%s", p.url, strings.Join(urlComponents, "/")) +} + +// encodeComponent encodes the provided string with base64.RawURLEncoding in +// case it contains '/' and as "=" in case it is empty. If neither is the case, +// it uses url.QueryEscape instead. It returns true in the former two cases. +func encodeComponent(s string) (string, bool) { + if s == "" { + return "=", true + } + if strings.Contains(s, "/") { + return base64.RawURLEncoding.EncodeToString([]byte(s)), true + } + return url.QueryEscape(s), false +} diff --git a/vendor/modules.txt b/vendor/modules.txt index bd8d5f74c5f..512534dece7 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -255,6 +255,7 @@ github.com/prometheus/client_golang/prometheus/internal github.com/prometheus/client_golang/prometheus/promauto github.com/prometheus/client_golang/prometheus/promhttp github.com/prometheus/client_golang/prometheus/promhttp/internal +github.com/prometheus/client_golang/prometheus/push # github.com/prometheus/client_model v0.6.1 ## explicit; go 1.19 github.com/prometheus/client_model/go From 1bbc8e4aee363015d8ce4ff9cbc473f5b58e7799 Mon Sep 17 00:00:00 2001 From: Lena Date: Tue, 26 May 2026 14:24:54 -0700 Subject: [PATCH 02/20] add pushgatewayURL to test configs --- test/config-next/cert-checker.json | 1 + test/config/cert-checker.json | 1 + 2 files changed, 2 insertions(+) diff --git a/test/config-next/cert-checker.json b/test/config-next/cert-checker.json index 34d34e3ec5b..c61d6c3fd31 100644 --- a/test/config-next/cert-checker.json +++ b/test/config-next/cert-checker.json @@ -4,6 +4,7 @@ "dbConnectFile": "test/secrets/cert_checker_dburl", "maxOpenConns": 10 }, + "pushgatewayURL": "", "hostnamePolicyFile": "test/ident-policy.yaml", "workers": 16, "unexpiredOnly": true, diff --git a/test/config/cert-checker.json b/test/config/cert-checker.json index aee0e9ae6cf..3191da5d93e 100644 --- a/test/config/cert-checker.json +++ b/test/config/cert-checker.json @@ -4,6 +4,7 @@ "dbConnectFile": "test/secrets/cert_checker_dburl", "maxOpenConns": 10 }, + "pushgatewayURL": "", "hostnamePolicyFile": "test/ident-policy.yaml", "workers": 16, "unexpiredOnly": true, From 7d26e0427780826f6b7ad6dabd5579a09165851f Mon Sep 17 00:00:00 2001 From: Lena Date: Tue, 26 May 2026 14:29:54 -0700 Subject: [PATCH 03/20] lena can't syntax in the merge conflict editor --- cmd/cert-checker/main.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index c38adfb3496..57bb32c7a3f 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -703,8 +703,9 @@ func main() { } else { logger.Infof("pushed metrics to pushgateway at %s", config.CertChecker.PushgatewayURL) } - - if checker.issuedReport.BadCerts > 0 { + } + + if checker.issuedReport.BadCerts > 0 { os.Exit(1) } } From a26009e48b0c04789a7a724db63cfccfbcd132b2 Mon Sep 17 00:00:00 2001 From: lenaunderwood22 <46858616+lenaunderwood22@users.noreply.github.com> Date: Thu, 28 May 2026 14:10:33 -0700 Subject: [PATCH 04/20] Update cmd/cert-checker/main.go Co-authored-by: Aaron Gable --- cmd/cert-checker/main.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index 57bb32c7a3f..c9f1ef2d5cb 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -682,9 +682,7 @@ func main() { fmt.Fprintf(os.Stderr, "# Processing certificates using %d workers\n", config.CertChecker.Workers) wg := new(sync.WaitGroup) for range config.CertChecker.Workers { - wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { s := checker.clock.Now() checker.processCerts(context.TODO(), config.CertChecker.BadResultsOnly) metrics.checkerLatency.Observe(checker.clock.Since(s).Seconds()) From 37ec383b86c3f600c63dabbd37f73799226cdc12 Mon Sep 17 00:00:00 2001 From: lenaunderwood22 <46858616+lenaunderwood22@users.noreply.github.com> Date: Thu, 28 May 2026 14:27:23 -0700 Subject: [PATCH 05/20] Update cmd/cert-checker/main.go Co-authored-by: Aaron Gable --- cmd/cert-checker/main.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index c9f1ef2d5cb..bc0127ed74c 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -696,7 +696,8 @@ func main() { metrics.checkerBadCount.Set(float64(checker.issuedReport.BadCerts)) if config.CertChecker.PushgatewayURL != "" { - if err = cmd.PushMetrics("cert-checker", config.CertChecker.PushgatewayURL, reg, logger); err != nil { + err = cmd.PushMetrics("cert-checker", config.CertChecker.PushgatewayURL, reg, logger) + if err != nil { logger.Warningf("failed to push metrics to pushgateway: %s", err) } else { logger.Infof("pushed metrics to pushgateway at %s", config.CertChecker.PushgatewayURL) From bd919b81f3fcf01e98c50dbbd595e46ba22f50da Mon Sep 17 00:00:00 2001 From: Lena Date: Thu, 28 May 2026 14:30:57 -0700 Subject: [PATCH 06/20] rename certCheckerMetrics to metrics; remove unexpiredOnly conf; small nit fixes --- cmd/cert-checker/main.go | 19 +++++++++---------- cmd/cert-checker/main_test.go | 4 ++-- test/config-next/cert-checker.json | 1 - test/config/cert-checker.json | 1 - 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index bc0127ed74c..4d6e4066dba 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -37,14 +37,14 @@ import ( "github.com/letsencrypt/boulder/sa" ) -type certCheckerMetrics struct { +type metrics struct { checkerLatency prometheus.Histogram checkerTimestamp prometheus.Gauge checkerGoodCount prometheus.Gauge checkerBadCount prometheus.Gauge } -func NewCertCheckerMetrics(stats prometheus.Registerer) *certCheckerMetrics { +func NewCertCheckerMetrics(stats prometheus.Registerer) *metrics { checkerLatency := promauto.With(stats).NewHistogram(prometheus.HistogramOpts{ Name: "cert_checker_latency", Help: "Histogram of latencies a cert-checker worker takes to complete a batch", @@ -64,7 +64,7 @@ func NewCertCheckerMetrics(stats prometheus.Registerer) *certCheckerMetrics { Name: "cert_checker_bad_count", Help: "Cert-checker count of bad certificates", }) - return &certCheckerMetrics{checkerLatency, checkerTimestamp, checkerGoodCount, checkerBadCount} + return &metrics{checkerLatency, checkerTimestamp, checkerGoodCount, checkerBadCount} } // For defense-in-depth in addition to using the PA & its identPolicy to check @@ -556,13 +556,12 @@ func (c *certChecker) checkCert(ctx context.Context, cert *corepb.Certificate) ( type Config struct { CertChecker struct { - DB cmd.DBConfig - PushgatewayURL string `validate:"omitempty,url"` + DB cmd.DBConfig cmd.HostnamePolicyConfig - Workers int `validate:"required,min=1"` + Workers int `validate:"required,min=1"` + PushgatewayURL string `validate:"omitempty,url"` // Deprecated: this is ignored, and cert checker always checks both expired and unexpired. - UnexpiredOnly bool BadResultsOnly bool CheckPeriod config.Duration @@ -686,7 +685,7 @@ func main() { s := checker.clock.Now() checker.processCerts(context.TODO(), config.CertChecker.BadResultsOnly) metrics.checkerLatency.Observe(checker.clock.Since(s).Seconds()) - }() + }) } wg.Wait() logger.AuditInfo("Finished processing certificates", checker.issuedReport) @@ -698,9 +697,9 @@ func main() { if config.CertChecker.PushgatewayURL != "" { err = cmd.PushMetrics("cert-checker", config.CertChecker.PushgatewayURL, reg, logger) if err != nil { - logger.Warningf("failed to push metrics to pushgateway: %s", err) + logger.Errf("failed to push metrics to pushgateway: %s", err) } else { - logger.Infof("pushed metrics to pushgateway at %s", config.CertChecker.PushgatewayURL) + logger.Debugf("pushed metrics to pushgateway at %s", config.CertChecker.PushgatewayURL) } } diff --git a/cmd/cert-checker/main_test.go b/cmd/cert-checker/main_test.go index 77ababf415e..6738173ceb6 100644 --- a/cmd/cert-checker/main_test.go +++ b/cmd/cert-checker/main_test.go @@ -31,7 +31,7 @@ import ( "github.com/letsencrypt/boulder/identifier" "github.com/letsencrypt/boulder/linter" blog "github.com/letsencrypt/boulder/log" - "github.com/letsencrypt/boulder/metrics" + m "github.com/letsencrypt/boulder/metrics" "github.com/letsencrypt/boulder/policy" "github.com/letsencrypt/boulder/sa" sapb "github.com/letsencrypt/boulder/sa/proto" @@ -337,7 +337,7 @@ func TestGetAndProcessCerts(t *testing.T) { mocklog := blog.NewMock() checker := newChecker(saDbMap, fc, pa, kp, time.Hour, testValidityDurations, nil, mocklog) - sa, err := sa.NewSQLStorageAuthority(saDbMap, saDbMap, nil, 0, fc, blog.NewMock(), metrics.NoopRegisterer) + sa, err := sa.NewSQLStorageAuthority(saDbMap, saDbMap, nil, 0, fc, blog.NewMock(), m.NoopRegisterer) test.AssertNotError(t, err, "Couldn't create SA to insert certificates") saCleanUp := test.ResetBoulderTestDatabase(t) defer func() { diff --git a/test/config-next/cert-checker.json b/test/config-next/cert-checker.json index 5274b85f729..8dd7d6e6ea5 100644 --- a/test/config-next/cert-checker.json +++ b/test/config-next/cert-checker.json @@ -7,7 +7,6 @@ "pushgatewayURL": "", "hostnamePolicyFile": "test/ident-policy.yaml", "workers": 16, - "unexpiredOnly": true, "badResultsOnly": true, "checkPeriod": "72h", "acceptableValidityDurations": [ diff --git a/test/config/cert-checker.json b/test/config/cert-checker.json index bd0ce9090b9..747f5eefe76 100644 --- a/test/config/cert-checker.json +++ b/test/config/cert-checker.json @@ -7,7 +7,6 @@ "pushgatewayURL": "", "hostnamePolicyFile": "test/ident-policy.yaml", "workers": 16, - "unexpiredOnly": true, "badResultsOnly": true, "checkPeriod": "72h", "acceptableValidityDurations": [ From e02607b1d39a25e77f4f49de10dace8e87147eb5 Mon Sep 17 00:00:00 2001 From: Lena Date: Thu, 28 May 2026 14:45:09 -0700 Subject: [PATCH 07/20] remove 'entries' from report struct --- cmd/cert-checker/main.go | 11 +---------- cmd/cert-checker/main_test.go | 1 - 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index 4d6e4066dba..bb8a9bb649b 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -94,7 +94,6 @@ type report struct { GoodCerts int64 `json:"good-certs"` BadCerts int64 `json:"bad-certs"` DbErrs int64 `json:"db-errs"` - entries map[string]reportEntry } type reportEntry struct { @@ -154,7 +153,6 @@ func newChecker(saDbMap certDB, certs: make(chan *corepb.Certificate, batchSize), rMu: new(sync.Mutex), clock: clk, - issuedReport: report{entries: make(map[string]reportEntry)}, checkPeriod: period, acceptableValidityDurations: avd, lints: lints, @@ -290,13 +288,6 @@ func (c *certChecker) processCerts(ctx context.Context, badResultsOnly bool) { sans, problems := c.checkCert(ctx, cert) valid := len(problems) == 0 c.rMu.Lock() - if !badResultsOnly || (badResultsOnly && !valid) { - c.issuedReport.entries[cert.Serial] = reportEntry{ - Valid: valid, - SANs: sans, - Problems: problems, - } - } c.rMu.Unlock() if !valid { atomic.AddInt64(&c.issuedReport.BadCerts, 1) @@ -561,7 +552,7 @@ type Config struct { Workers int `validate:"required,min=1"` PushgatewayURL string `validate:"omitempty,url"` - // Deprecated: this is ignored, and cert checker always checks both expired and unexpired. + // Deprecated: cert-checker only logs bad results anyway. BadResultsOnly bool CheckPeriod config.Duration diff --git a/cmd/cert-checker/main_test.go b/cmd/cert-checker/main_test.go index 6738173ceb6..b8b695509a0 100644 --- a/cmd/cert-checker/main_test.go +++ b/cmd/cert-checker/main_test.go @@ -377,7 +377,6 @@ func TestGetAndProcessCerts(t *testing.T) { test.AssertEquals(t, len(checker.certs), 5) checker.processCerts(context.Background(), false) test.AssertEquals(t, checker.issuedReport.BadCerts, int64(5)) - test.AssertEquals(t, len(checker.issuedReport.entries), 5) test.AssertEquals(t, len(mocklog.GetAllMatching("certificate error found")), 5) } From be9ce40819a86b9ede1972f6281293806d20061f Mon Sep 17 00:00:00 2001 From: Lena Date: Thu, 28 May 2026 14:55:33 -0700 Subject: [PATCH 08/20] drop reportEntry too --- cmd/cert-checker/main.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index bb8a9bb649b..5f90ba00c61 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -96,12 +96,6 @@ type report struct { DbErrs int64 `json:"db-errs"` } -type reportEntry struct { - Valid bool `json:"valid"` - SANs []string `json:"sans"` - Problems []string `json:"problems,omitempty"` -} - // certDB is an interface collecting the borp.DbMap functions that the various // parts of cert-checker rely on. Using this adapter shim allows tests to swap // out the saDbMap implementation. From c5e0cf61342cd171a31c9485d5b148a491133089 Mon Sep 17 00:00:00 2001 From: Lena Date: Thu, 28 May 2026 15:03:04 -0700 Subject: [PATCH 09/20] remove badresultsonly references --- cmd/cert-checker/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index 5f90ba00c61..6e9a95887de 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -277,7 +277,7 @@ func (c *certChecker) getCerts(ctx context.Context) error { return nil } -func (c *certChecker) processCerts(ctx context.Context, badResultsOnly bool) { +func (c *certChecker) processCerts(ctx context.Context) { for cert := range c.certs { sans, problems := c.checkCert(ctx, cert) valid := len(problems) == 0 @@ -668,7 +668,7 @@ func main() { for range config.CertChecker.Workers { wg.Go(func() { s := checker.clock.Now() - checker.processCerts(context.TODO(), config.CertChecker.BadResultsOnly) + checker.processCerts(context.TODO()) metrics.checkerLatency.Observe(checker.clock.Since(s).Seconds()) }) } From 96eb1c2bc2cc8cc95b087e998d44a81b0dbf4bb6 Mon Sep 17 00:00:00 2001 From: Lena Date: Thu, 28 May 2026 15:07:11 -0700 Subject: [PATCH 10/20] one more --- cmd/cert-checker/main_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/cert-checker/main_test.go b/cmd/cert-checker/main_test.go index b8b695509a0..5f888b1ea78 100644 --- a/cmd/cert-checker/main_test.go +++ b/cmd/cert-checker/main_test.go @@ -375,7 +375,7 @@ func TestGetAndProcessCerts(t *testing.T) { err = checker.getCerts(context.Background()) test.AssertNotError(t, err, "Failed to retrieve certificates") test.AssertEquals(t, len(checker.certs), 5) - checker.processCerts(context.Background(), false) + checker.processCerts(context.Background()) test.AssertEquals(t, checker.issuedReport.BadCerts, int64(5)) test.AssertEquals(t, len(mocklog.GetAllMatching("certificate error found")), 5) } From a09d4b116de6733371e11be1a127987176829ce0 Mon Sep 17 00:00:00 2001 From: Lena Date: Thu, 28 May 2026 15:14:47 -0700 Subject: [PATCH 11/20] more cleanup --- cmd/cert-checker/main.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index 6e9a95887de..a602ddd1bba 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -281,8 +281,6 @@ func (c *certChecker) processCerts(ctx context.Context) { for cert := range c.certs { sans, problems := c.checkCert(ctx, cert) valid := len(problems) == 0 - c.rMu.Lock() - c.rMu.Unlock() if !valid { atomic.AddInt64(&c.issuedReport.BadCerts, 1) c.logger.AuditErr("certificate error found", nil, map[string]any{"serial": cert.Serial, "sans": sans, "problems": problems}) From 15ce5866bdd40eb2b937e1607a11b592b8d83922 Mon Sep 17 00:00:00 2001 From: Lena Date: Fri, 29 May 2026 07:50:11 -0700 Subject: [PATCH 12/20] rename metrics back to certCheckerMetrics --- cmd/cert-checker/main.go | 6 +++--- cmd/cert-checker/main_test.go | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index a602ddd1bba..f254aa9c0f4 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -37,14 +37,14 @@ import ( "github.com/letsencrypt/boulder/sa" ) -type metrics struct { +type certCheckerMetrics struct { checkerLatency prometheus.Histogram checkerTimestamp prometheus.Gauge checkerGoodCount prometheus.Gauge checkerBadCount prometheus.Gauge } -func NewCertCheckerMetrics(stats prometheus.Registerer) *metrics { +func NewCertCheckerMetrics(stats prometheus.Registerer) *certCheckerMetrics { checkerLatency := promauto.With(stats).NewHistogram(prometheus.HistogramOpts{ Name: "cert_checker_latency", Help: "Histogram of latencies a cert-checker worker takes to complete a batch", @@ -64,7 +64,7 @@ func NewCertCheckerMetrics(stats prometheus.Registerer) *metrics { Name: "cert_checker_bad_count", Help: "Cert-checker count of bad certificates", }) - return &metrics{checkerLatency, checkerTimestamp, checkerGoodCount, checkerBadCount} + return &certCheckerMetrics{checkerLatency, checkerTimestamp, checkerGoodCount, checkerBadCount} } // For defense-in-depth in addition to using the PA & its identPolicy to check diff --git a/cmd/cert-checker/main_test.go b/cmd/cert-checker/main_test.go index 5f888b1ea78..8aabc519bb2 100644 --- a/cmd/cert-checker/main_test.go +++ b/cmd/cert-checker/main_test.go @@ -31,7 +31,7 @@ import ( "github.com/letsencrypt/boulder/identifier" "github.com/letsencrypt/boulder/linter" blog "github.com/letsencrypt/boulder/log" - m "github.com/letsencrypt/boulder/metrics" + "github.com/letsencrypt/boulder/metrics" "github.com/letsencrypt/boulder/policy" "github.com/letsencrypt/boulder/sa" sapb "github.com/letsencrypt/boulder/sa/proto" @@ -337,7 +337,7 @@ func TestGetAndProcessCerts(t *testing.T) { mocklog := blog.NewMock() checker := newChecker(saDbMap, fc, pa, kp, time.Hour, testValidityDurations, nil, mocklog) - sa, err := sa.NewSQLStorageAuthority(saDbMap, saDbMap, nil, 0, fc, blog.NewMock(), m.NoopRegisterer) + sa, err := sa.NewSQLStorageAuthority(saDbMap, saDbMap, nil, 0, fc, blog.NewMock(), metrics.NoopRegisterer) test.AssertNotError(t, err, "Couldn't create SA to insert certificates") saCleanUp := test.ResetBoulderTestDatabase(t) defer func() { From 02e95e5cfdd388e2bf4cab4ad4e26ab4b1f2583a Mon Sep 17 00:00:00 2001 From: Lena Date: Fri, 29 May 2026 11:37:11 -0700 Subject: [PATCH 13/20] lookup pushgatewayurl --- cmd/cert-checker/main.go | 64 +++++++++++++++++++++++++++--- test/config-next/cert-checker.json | 1 - test/config/cert-checker.json | 1 - 3 files changed, 58 insertions(+), 8 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index f254aa9c0f4..6a74272c746 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -6,10 +6,12 @@ import ( "crypto/x509" "flag" "fmt" + "net" "net/netip" "os" "regexp" "slices" + "strings" "sync" "sync/atomic" "time" @@ -542,8 +544,20 @@ type Config struct { DB cmd.DBConfig cmd.HostnamePolicyConfig - Workers int `validate:"required,min=1"` - PushgatewayURL string `validate:"omitempty,url"` + Workers int `validate:"required,min=1"` + // LookupDNSAuthority can only be specified with PushgatewayService. It's a single + // : of the DNS server to be used for resolution + // of pushgateway backends. If the address contains a hostname it will be resolved + // using system DNS. If the address contains a port, the client will use it + // directly, otherwise port 53 is used. + LookupDNSAuthority string `validate:"required_with=PushgatewayService,ip|hostname|hostname_port"` + // PushgatewayService entry contains a service and domain name that will be used + // to construct a SRV DNS query to lookup pushgateway backends. For example: if + // the resource record is 'foo.service.consul', then the 'Service' is 'foo' + // and the 'Domain' is 'service.consul'. The expected dNSName to be + // authenticated in the server certificate would be 'foo.service.consul'. + PushgatewayService cmd.ServiceDomain `validate:"required_with=LookupDNSAuthority"` + PushgatewayScheme string `validate:"required_with=PushgatewayService,oneof=http https"` // Deprecated: cert-checker only logs bad results anyway. BadResultsOnly bool CheckPeriod config.Duration @@ -580,6 +594,39 @@ type Config struct { Syslog cmd.SyslogConfig } +func getPushgatewayURL(dnsAuthority, scheme string, svc cmd.ServiceDomain) (string, error) { + host, port, err := net.SplitHostPort(dnsAuthority) + if err != nil { + // Assume only hostname or IPv4 address was specified. + host = dnsAuthority + port = "53" + } + r := &net.Resolver{ + PreferGo: true, + Dial: func(ctx context.Context, network, _ string) (net.Conn, error) { + return (&net.Dialer{}).DialContext(ctx, network, net.JoinHostPort(host, port)) + }, + } + lookupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, targets, err := r.LookupSRV(lookupCtx, svc.Service, "tcp", svc.Domain) + if err != nil { + return "", fmt.Errorf("SRV lookup of _%s._tcp.%s failed: %w", svc.Service, svc.Domain, err) + } + if len(targets) == 0 { + return "", fmt.Errorf("SRV lookup of _%s._tcp.%s returned 0 results", svc.Service, svc.Domain) + } + target := strings.TrimSuffix(targets[0].Target, ".") + addrs, err := r.LookupHost(lookupCtx, target) + if err != nil { + return "", fmt.Errorf("A/AAAA lookup of %q failed: %w", target, err) + } + if len(addrs) == 0 { + return "", fmt.Errorf("A/AAAA lookup of %q returned 0 results", target) + } + return fmt.Sprintf("%s://%s", scheme, net.JoinHostPort(addrs[0], fmt.Sprint(targets[0].Port))), nil +} + func main() { configFile := flag.String("config", "", "File path to the configuration file for this service") flag.Parse() @@ -677,12 +724,17 @@ func main() { metrics.checkerGoodCount.Set(float64(checker.issuedReport.GoodCerts)) metrics.checkerBadCount.Set(float64(checker.issuedReport.BadCerts)) - if config.CertChecker.PushgatewayURL != "" { - err = cmd.PushMetrics("cert-checker", config.CertChecker.PushgatewayURL, reg, logger) + if config.CertChecker.PushgatewayService.Service != "" { + pushgatewayURL, err := getPushgatewayURL(config.CertChecker.LookupDNSAuthority, config.CertChecker.PushgatewayScheme, config.CertChecker.PushgatewayService) if err != nil { - logger.Errf("failed to push metrics to pushgateway: %s", err) + logger.Errf("failed to get pushgateway URL: %s", err) } else { - logger.Debugf("pushed metrics to pushgateway at %s", config.CertChecker.PushgatewayURL) + err = cmd.PushMetrics("cert-checker", pushgatewayURL, reg, logger) + if err != nil { + logger.Errf("failed to push metrics to pushgateway: %s", err) + } else { + logger.Debugf("pushed metrics to pushgateway at %s", pushgatewayURL) + } } } diff --git a/test/config-next/cert-checker.json b/test/config-next/cert-checker.json index 8dd7d6e6ea5..8dccca4e23a 100644 --- a/test/config-next/cert-checker.json +++ b/test/config-next/cert-checker.json @@ -4,7 +4,6 @@ "dbConnectFile": "test/secrets/cert_checker_dburl", "maxOpenConns": 10 }, - "pushgatewayURL": "", "hostnamePolicyFile": "test/ident-policy.yaml", "workers": 16, "badResultsOnly": true, diff --git a/test/config/cert-checker.json b/test/config/cert-checker.json index 747f5eefe76..4f51507f249 100644 --- a/test/config/cert-checker.json +++ b/test/config/cert-checker.json @@ -4,7 +4,6 @@ "dbConnectFile": "test/secrets/cert_checker_dburl", "maxOpenConns": 10 }, - "pushgatewayURL": "", "hostnamePolicyFile": "test/ident-policy.yaml", "workers": 16, "badResultsOnly": true, From 90f7d8f9ca629396cb543472965eb46975705f9b Mon Sep 17 00:00:00 2001 From: Lena Date: Fri, 29 May 2026 11:49:28 -0700 Subject: [PATCH 14/20] update key validations --- cmd/cert-checker/main.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index 6a74272c746..3a517e2cb6c 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -550,14 +550,14 @@ type Config struct { // of pushgateway backends. If the address contains a hostname it will be resolved // using system DNS. If the address contains a port, the client will use it // directly, otherwise port 53 is used. - LookupDNSAuthority string `validate:"required_with=PushgatewayService,ip|hostname|hostname_port"` + LookupDNSAuthority string `validate:"required_with=PushgatewayService,omitempty,ip|hostname|hostname_port"` // PushgatewayService entry contains a service and domain name that will be used // to construct a SRV DNS query to lookup pushgateway backends. For example: if // the resource record is 'foo.service.consul', then the 'Service' is 'foo' // and the 'Domain' is 'service.consul'. The expected dNSName to be // authenticated in the server certificate would be 'foo.service.consul'. - PushgatewayService cmd.ServiceDomain `validate:"required_with=LookupDNSAuthority"` - PushgatewayScheme string `validate:"required_with=PushgatewayService,oneof=http https"` + PushgatewayService *cmd.ServiceDomain `validate:"required_with=LookupDNSAuthority"` + PushgatewayScheme string `validate:"required_with=PushgatewayService,omitempty,oneof=http https"` // Deprecated: cert-checker only logs bad results anyway. BadResultsOnly bool CheckPeriod config.Duration @@ -725,7 +725,7 @@ func main() { metrics.checkerBadCount.Set(float64(checker.issuedReport.BadCerts)) if config.CertChecker.PushgatewayService.Service != "" { - pushgatewayURL, err := getPushgatewayURL(config.CertChecker.LookupDNSAuthority, config.CertChecker.PushgatewayScheme, config.CertChecker.PushgatewayService) + pushgatewayURL, err := getPushgatewayURL(config.CertChecker.LookupDNSAuthority, config.CertChecker.PushgatewayScheme, *config.CertChecker.PushgatewayService) if err != nil { logger.Errf("failed to get pushgateway URL: %s", err) } else { From c50dce2eab8339e8faa06fd4367f301fcacdbc97 Mon Sep 17 00:00:00 2001 From: Lena Date: Fri, 29 May 2026 11:58:42 -0700 Subject: [PATCH 15/20] fix nil pointer ref --- cmd/cert-checker/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index 3a517e2cb6c..95f4a054760 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -724,7 +724,7 @@ func main() { metrics.checkerGoodCount.Set(float64(checker.issuedReport.GoodCerts)) metrics.checkerBadCount.Set(float64(checker.issuedReport.BadCerts)) - if config.CertChecker.PushgatewayService.Service != "" { + if config.CertChecker.PushgatewayService != nil { pushgatewayURL, err := getPushgatewayURL(config.CertChecker.LookupDNSAuthority, config.CertChecker.PushgatewayScheme, *config.CertChecker.PushgatewayService) if err != nil { logger.Errf("failed to get pushgateway URL: %s", err) From 4b4d70ec7d5d46361378f5f566d941a2551c2d39 Mon Sep 17 00:00:00 2001 From: lenaunderwood22 <46858616+lenaunderwood22@users.noreply.github.com> Date: Fri, 29 May 2026 15:34:53 -0700 Subject: [PATCH 16/20] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- cmd/cert-checker/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index 95f4a054760..36285c304d6 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -550,14 +550,14 @@ type Config struct { // of pushgateway backends. If the address contains a hostname it will be resolved // using system DNS. If the address contains a port, the client will use it // directly, otherwise port 53 is used. - LookupDNSAuthority string `validate:"required_with=PushgatewayService,omitempty,ip|hostname|hostname_port"` + LookupDNSAuthority string `validate:"excluded_without=PushgatewayService,required_with=PushgatewayService,omitempty,ip|hostname|hostname_port"` // PushgatewayService entry contains a service and domain name that will be used // to construct a SRV DNS query to lookup pushgateway backends. For example: if // the resource record is 'foo.service.consul', then the 'Service' is 'foo' // and the 'Domain' is 'service.consul'. The expected dNSName to be // authenticated in the server certificate would be 'foo.service.consul'. PushgatewayService *cmd.ServiceDomain `validate:"required_with=LookupDNSAuthority"` - PushgatewayScheme string `validate:"required_with=PushgatewayService,omitempty,oneof=http https"` + PushgatewayScheme string `validate:"excluded_without=PushgatewayService,required_with=PushgatewayService,omitempty,oneof=http https"` // Deprecated: cert-checker only logs bad results anyway. BadResultsOnly bool CheckPeriod config.Duration From 2a03d31e74da1c4c632fd5f88d75de9010cc30df Mon Sep 17 00:00:00 2001 From: Lena Date: Fri, 29 May 2026 15:52:12 -0700 Subject: [PATCH 17/20] context with 10s timeout for lookups & push --- cmd/cert-checker/main.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index 36285c304d6..a2e5b2f2997 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -594,7 +594,7 @@ type Config struct { Syslog cmd.SyslogConfig } -func getPushgatewayURL(dnsAuthority, scheme string, svc cmd.ServiceDomain) (string, error) { +func getPushgatewayURL(ctx context.Context, dnsAuthority, scheme string, svc cmd.ServiceDomain) (string, error) { host, port, err := net.SplitHostPort(dnsAuthority) if err != nil { // Assume only hostname or IPv4 address was specified. @@ -607,9 +607,7 @@ func getPushgatewayURL(dnsAuthority, scheme string, svc cmd.ServiceDomain) (stri return (&net.Dialer{}).DialContext(ctx, network, net.JoinHostPort(host, port)) }, } - lookupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - _, targets, err := r.LookupSRV(lookupCtx, svc.Service, "tcp", svc.Domain) + _, targets, err := r.LookupSRV(ctx, svc.Service, "tcp", svc.Domain) if err != nil { return "", fmt.Errorf("SRV lookup of _%s._tcp.%s failed: %w", svc.Service, svc.Domain, err) } @@ -617,7 +615,7 @@ func getPushgatewayURL(dnsAuthority, scheme string, svc cmd.ServiceDomain) (stri return "", fmt.Errorf("SRV lookup of _%s._tcp.%s returned 0 results", svc.Service, svc.Domain) } target := strings.TrimSuffix(targets[0].Target, ".") - addrs, err := r.LookupHost(lookupCtx, target) + addrs, err := r.LookupHost(ctx, target) if err != nil { return "", fmt.Errorf("A/AAAA lookup of %q failed: %w", target, err) } @@ -725,7 +723,9 @@ func main() { metrics.checkerBadCount.Set(float64(checker.issuedReport.BadCerts)) if config.CertChecker.PushgatewayService != nil { - pushgatewayURL, err := getPushgatewayURL(config.CertChecker.LookupDNSAuthority, config.CertChecker.PushgatewayScheme, *config.CertChecker.PushgatewayService) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + pushgatewayURL, err := getPushgatewayURL(ctx, config.CertChecker.LookupDNSAuthority, config.CertChecker.PushgatewayScheme, *config.CertChecker.PushgatewayService) if err != nil { logger.Errf("failed to get pushgateway URL: %s", err) } else { From dad8b8c7edc20591229003c02a5b46eb9dcddd40 Mon Sep 17 00:00:00 2001 From: Lena Date: Fri, 29 May 2026 16:14:04 -0700 Subject: [PATCH 18/20] hardcode http --- cmd/cert-checker/main.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index a2e5b2f2997..afa929bacea 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -557,7 +557,6 @@ type Config struct { // and the 'Domain' is 'service.consul'. The expected dNSName to be // authenticated in the server certificate would be 'foo.service.consul'. PushgatewayService *cmd.ServiceDomain `validate:"required_with=LookupDNSAuthority"` - PushgatewayScheme string `validate:"excluded_without=PushgatewayService,required_with=PushgatewayService,omitempty,oneof=http https"` // Deprecated: cert-checker only logs bad results anyway. BadResultsOnly bool CheckPeriod config.Duration @@ -594,7 +593,7 @@ type Config struct { Syslog cmd.SyslogConfig } -func getPushgatewayURL(ctx context.Context, dnsAuthority, scheme string, svc cmd.ServiceDomain) (string, error) { +func getPushgatewayURL(ctx context.Context, dnsAuthority string, svc cmd.ServiceDomain) (string, error) { host, port, err := net.SplitHostPort(dnsAuthority) if err != nil { // Assume only hostname or IPv4 address was specified. @@ -622,7 +621,7 @@ func getPushgatewayURL(ctx context.Context, dnsAuthority, scheme string, svc cmd if len(addrs) == 0 { return "", fmt.Errorf("A/AAAA lookup of %q returned 0 results", target) } - return fmt.Sprintf("%s://%s", scheme, net.JoinHostPort(addrs[0], fmt.Sprint(targets[0].Port))), nil + return fmt.Sprintf("http://%s", net.JoinHostPort(addrs[0], fmt.Sprint(targets[0].Port))), nil } func main() { @@ -725,7 +724,7 @@ func main() { if config.CertChecker.PushgatewayService != nil { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - pushgatewayURL, err := getPushgatewayURL(ctx, config.CertChecker.LookupDNSAuthority, config.CertChecker.PushgatewayScheme, *config.CertChecker.PushgatewayService) + pushgatewayURL, err := getPushgatewayURL(ctx, config.CertChecker.LookupDNSAuthority, *config.CertChecker.PushgatewayService) if err != nil { logger.Errf("failed to get pushgateway URL: %s", err) } else { From ad64b0808bb5ffce98d516a4c21f4108043fba83 Mon Sep 17 00:00:00 2001 From: Lena Date: Fri, 29 May 2026 16:40:59 -0700 Subject: [PATCH 19/20] add comments to getPushgatewayURL, add tests for it --- cmd/cert-checker/main.go | 10 +++++++++ cmd/cert-checker/main_test.go | 42 +++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/cmd/cert-checker/main.go b/cmd/cert-checker/main.go index afa929bacea..4e4e880c9e3 100644 --- a/cmd/cert-checker/main.go +++ b/cmd/cert-checker/main.go @@ -593,6 +593,13 @@ type Config struct { Syslog cmd.SyslogConfig } +// getPushgatewayURL resolves svc via SRV+A lookups against dnsAuthority and +// returns an http:// URL whose host is an IP address. Both lookups go through +// dnsAuthority (typically Consul DNS) because the system resolver can't answer +// queries for the .consul domain. The SRV target is then flattened to an IP +// because the returned URL is consumed by net/http via cmd.PushMetrics, which +// resolves hostnames using the system resolver. Scheme is fixed to http: +// pushgateway is assumed to be on an internal network func getPushgatewayURL(ctx context.Context, dnsAuthority string, svc cmd.ServiceDomain) (string, error) { host, port, err := net.SplitHostPort(dnsAuthority) if err != nil { @@ -613,6 +620,9 @@ func getPushgatewayURL(ctx context.Context, dnsAuthority string, svc cmd.Service if len(targets) == 0 { return "", fmt.Errorf("SRV lookup of _%s._tcp.%s returned 0 results", svc.Service, svc.Domain) } + // Flatten the SRV target to an IP using the same Consul authority; net/http + // (used downstream) would otherwise try to resolve names like + // *.addr.dc1.consul via the system resolver and fail. target := strings.TrimSuffix(targets[0].Target, ".") addrs, err := r.LookupHost(ctx, target) if err != nil { diff --git a/cmd/cert-checker/main_test.go b/cmd/cert-checker/main_test.go index 8aabc519bb2..3fdd90fe77f 100644 --- a/cmd/cert-checker/main_test.go +++ b/cmd/cert-checker/main_test.go @@ -14,6 +14,8 @@ import ( "log" "math/big" mrand "math/rand/v2" + "net" + "net/url" "os" "slices" "strings" @@ -23,6 +25,7 @@ import ( "github.com/jmhodges/clock" "google.golang.org/protobuf/types/known/timestamppb" + "github.com/letsencrypt/boulder/cmd" "github.com/letsencrypt/boulder/core" corepb "github.com/letsencrypt/boulder/core/proto" "github.com/letsencrypt/boulder/ctpolicy/loglist" @@ -672,3 +675,42 @@ func TestPrecertCorrespond(t *testing.T) { } t.Fatalf("expected precert correspondence problem, but got: %v", problems) } + +func TestGetPushgatewayURL(t *testing.T) { + ctx := context.Background() + t.Run("happy path", func(t *testing.T) { + gotURL, err := getPushgatewayURL(ctx, "consul.service.consul:52", + cmd.ServiceDomain{Service: "redisratelimits", Domain: "service.consul"}) + test.AssertNotError(t, err, "") + + parsed, err := url.Parse(gotURL) + test.AssertNotError(t, err, "returned URL should be parseable") + test.AssertEquals(t, parsed.Scheme, "http") + + host, port, err := net.SplitHostPort(parsed.Host) + test.AssertNotError(t, err, "URL host should contain a port") + test.AssertNotNil(t, net.ParseIP(host), "host should be an IP, not a hostname (LookupHost flatten step)") + test.AssertEquals(t, port, "52") + }) + t.Run("DNS authority no port specified", func(t *testing.T) { + gotURL, err := getPushgatewayURL(ctx, "consul.service.consul", + cmd.ServiceDomain{Service: "redisratelimits", Domain: "service.consul"}) + test.AssertError(t, err, "") + + parsed, err := url.Parse(gotURL) + test.AssertNotError(t, err, "returned URL should be parseable") + _, port, err := net.SplitHostPort(parsed.Host) + test.AssertNotError(t, err, "URL host should contain a port") + test.AssertEquals(t, port, "53") + }) + t.Run("SRV not found", func(t *testing.T) { + _, err := getPushgatewayURL(ctx, "consul.service.consul:53", + cmd.ServiceDomain{Service: "doesnotexist", Domain: "service.consul"}) + test.AssertError(t, err, "") + }) + t.Run("DNS authority unreachable", func(t *testing.T) { + _, err := getPushgatewayURL(ctx, "doesnotexist.invalid:53", + cmd.ServiceDomain{Service: "redisratelimits", Domain: "service.consul"}) + test.AssertError(t, err, "") + }) +} From 6c75a9c97744c9e792d3c91d40edae320811e544 Mon Sep 17 00:00:00 2001 From: Lena Date: Fri, 29 May 2026 16:52:33 -0700 Subject: [PATCH 20/20] test fixes --- cmd/cert-checker/main_test.go | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/cmd/cert-checker/main_test.go b/cmd/cert-checker/main_test.go index 3fdd90fe77f..4c6f08141db 100644 --- a/cmd/cert-checker/main_test.go +++ b/cmd/cert-checker/main_test.go @@ -18,6 +18,7 @@ import ( "net/url" "os" "slices" + "strconv" "strings" "testing" "time" @@ -679,7 +680,7 @@ func TestPrecertCorrespond(t *testing.T) { func TestGetPushgatewayURL(t *testing.T) { ctx := context.Background() t.Run("happy path", func(t *testing.T) { - gotURL, err := getPushgatewayURL(ctx, "consul.service.consul:52", + gotURL, err := getPushgatewayURL(ctx, "consul.service.consul:53", cmd.ServiceDomain{Service: "redisratelimits", Domain: "service.consul"}) test.AssertNotError(t, err, "") @@ -689,19 +690,15 @@ func TestGetPushgatewayURL(t *testing.T) { host, port, err := net.SplitHostPort(parsed.Host) test.AssertNotError(t, err, "URL host should contain a port") - test.AssertNotNil(t, net.ParseIP(host), "host should be an IP, not a hostname (LookupHost flatten step)") - test.AssertEquals(t, port, "52") + test.AssertNotNil(t, net.ParseIP(host), "host should be an IP (LookupHost flatten step)") + portNum, err := strconv.Atoi(port) + test.AssertNotError(t, err, "port should be numeric") + test.Assert(t, portNum > 0 && portNum < 65536, "port should be in valid range") }) t.Run("DNS authority no port specified", func(t *testing.T) { - gotURL, err := getPushgatewayURL(ctx, "consul.service.consul", + _, err := getPushgatewayURL(ctx, "consul.service.consul", cmd.ServiceDomain{Service: "redisratelimits", Domain: "service.consul"}) - test.AssertError(t, err, "") - - parsed, err := url.Parse(gotURL) - test.AssertNotError(t, err, "returned URL should be parseable") - _, port, err := net.SplitHostPort(parsed.Host) - test.AssertNotError(t, err, "URL host should contain a port") - test.AssertEquals(t, port, "53") + test.AssertNotError(t, err, "") }) t.Run("SRV not found", func(t *testing.T) { _, err := getPushgatewayURL(ctx, "consul.service.consul:53",