Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions test/extended/router/gatewayapi_upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,9 @@ func (t *GatewayAPIUpgradeTest) Setup(ctx context.Context, f *e2e.Framework) {
_, err = assertHttpRouteSuccessful(t.oc, t.gatewayName, t.routeName)
o.Expect(err).NotTo(o.HaveOccurred())

if t.loadBalancerSupported && t.managedDNS {
if t.loadBalancerSupported {
g.By("Verifying HTTP connectivity before upgrade")
assertHttpRouteConnection(t.hostname)
assertHttpRouteConnection(t.oc, t.gatewayName+"-openshift-default", t.hostname, t.loadBalancerSupported)
e2e.Logf("HTTPRoute connectivity verified before upgrade")
}
}
Expand Down Expand Up @@ -182,9 +182,9 @@ func (t *GatewayAPIUpgradeTest) Test(ctx context.Context, f *e2e.Framework, done
_, err = assertHttpRouteSuccessful(t.oc, t.gatewayName, t.routeName)
o.Expect(err).NotTo(o.HaveOccurred())

if t.loadBalancerSupported && t.managedDNS {
if t.loadBalancerSupported {
g.By("Verifying HTTP connectivity after upgrade")
assertHttpRouteConnection(t.hostname)
assertHttpRouteConnection(t.oc, t.gatewayName+"-openshift-default", t.hostname, t.loadBalancerSupported)
}

if migrationOccurred {
Expand Down
126 changes: 81 additions & 45 deletions test/extended/router/gatewayapicontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package router

import (
"context"
"crypto/tls"
"errors"
"fmt"
"net"
Expand Down Expand Up @@ -403,9 +402,9 @@ var _ = g.Describe("[sig-network-edge][OCPFeatureGate:GatewayAPIController][Feat
g.By("Checking the http route using the default gateway is accepted")
assertHttpRouteSuccessful(oc, gw, "test-httproute")

g.By("Validating the http connectivity to the backend application")
if loadBalancerSupported && managedDNS {
assertHttpRouteConnection(defaultRoutename)
if loadBalancerSupported {
g.By("Validating the http connectivity to the backend application")
assertHttpRouteConnection(oc, gw+"-openshift-default", defaultRoutename, loadBalancerSupported)
}
})

Expand Down Expand Up @@ -612,20 +611,13 @@ func getPlatformCapabilities(oc *exutil.CLI) (loadBalancerSupported bool, manage
loadBalancerSupported = false
}

managedDNS = isDNSManaged(oc)
managedDNS, err = isDNSManaged(oc, time.Minute)
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to check if DNS is managed")

e2e.Logf("Platform: %s, LoadBalancer supported: %t, DNS managed: %t", infra.Status.PlatformStatus.Type, loadBalancerSupported, managedDNS)
return loadBalancerSupported, managedDNS
}

// isDNSManaged checks if the cluster has DNS zones configured (public or private).
// On platforms like vSphere without external DNS, DNS records cannot be managed.
func isDNSManaged(oc *exutil.CLI) bool {
dnsConfig, err := oc.AdminConfigClient().ConfigV1().DNSes().Get(context.Background(), "cluster", metav1.GetOptions{})
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to get DNS config")
return dnsConfig.Spec.PrivateZone != nil || dnsConfig.Spec.PublicZone != nil
}

// isIPv6OrDualStack checks if the cluster is using IPv6 or dual-stack networking.
// Returns true if any ServiceNetwork CIDR is IPv6 (indicates IPv6-only or dual-stack).
func isIPv6OrDualStack(oc *exutil.CLI) (bool, error) {
Expand Down Expand Up @@ -753,18 +745,17 @@ func buildGateway(name, namespace, gcname, fromNs, domain string) *gatewayapiv1.
}
}

// assertGatewayLoadbalancerReady verifies that the given gateway has the service's load balancer address assigned.
func assertGatewayLoadbalancerReady(oc *exutil.CLI, gwName, gwServiceName string) {
// check gateway LB service, note that External-IP might be hostname (AWS) or IP (Azure/GCP)
// return LoadBalancer service address, note that External-IP might be hostname (AWS) or IP (Azure/GCP)
func getLoadBalancerAddress(oc *exutil.CLI, serviceName string) string {
var lbAddress string
err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, loadBalancerReadyTimeout, false, func(context context.Context) (bool, error) {
lbService, err := oc.AdminKubeClient().CoreV1().Services(ingressNamespace).Get(context, gwServiceName, metav1.GetOptions{})
lbService, err := oc.AdminKubeClient().CoreV1().Services(ingressNamespace).Get(context, serviceName, metav1.GetOptions{})
if err != nil {
e2e.Logf("Failed to get service %q: %v, retrying...", gwServiceName, err)
e2e.Logf("Failed to get service %q: %v, retrying...", serviceName, err)
return false, nil
}
if len(lbService.Status.LoadBalancer.Ingress) == 0 {
e2e.Logf("Service %q has no load balancer; retrying...", gwServiceName)
e2e.Logf("Service %q has no load balancer; retrying...", serviceName)
return false, nil
}
if lbService.Status.LoadBalancer.Ingress[0].Hostname != "" {
Expand All @@ -773,11 +764,20 @@ func assertGatewayLoadbalancerReady(oc *exutil.CLI, gwName, gwServiceName string
lbAddress = lbService.Status.LoadBalancer.Ingress[0].IP
}
if lbAddress == "" {
e2e.Logf("No load balancer address for service %q, retrying", gwServiceName)
e2e.Logf("No load balancer address for service %q, retrying", serviceName)
return false, nil
}
e2e.Logf("Got load balancer address for service %q: %v", gwServiceName, lbAddress)
return true, nil
})
o.Expect(err).NotTo(o.HaveOccurred(), "Timed out to get load balancer address of service %q", serviceName)
e2e.Logf("Got load balancer address for service %q: %v", serviceName, lbAddress)
return lbAddress
}

// assertGatewayLoadbalancerReady verifies that the given gateway has the service's load balancer address assigned.
func assertGatewayLoadbalancerReady(oc *exutil.CLI, gwName, gwServiceName string) {
lbAddress := getLoadBalancerAddress(oc, gwServiceName)
err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, loadBalancerReadyTimeout, false, func(context context.Context) (bool, error) {
gw, err := oc.AdminGatewayApiClient().GatewayV1().Gateways(ingressNamespace).Get(context, gwName, metav1.GetOptions{})
if err != nil {
e2e.Logf("Failed to get gateway %q: %v; retrying...", err, gwName)
Expand All @@ -795,10 +795,17 @@ func assertGatewayLoadbalancerReady(oc *exutil.CLI, gwName, gwServiceName string
o.Expect(err).NotTo(o.HaveOccurred(), "Timed out waiting for gateway %q to get load balancer address of service %q", gwName, gwServiceName)
}

// assertDNSRecordStatus polls until the DNSRecord's status in the default operand namespace is True.
// assertDNSRecordStatus polls until the DNSRecord's status in the default operand namespace is ready.
// When DNS is managed, it waits for all zones to have Published=True.
// When DNS is unmanaged, it waits for all zones to have Published!=True (expected in custom-dns clusters).
func assertDNSRecordStatus(oc *exutil.CLI, gatewayName string) {
// find the DNS Record and confirm its zone status is True
err := wait.PollUntilContextTimeout(context.Background(), 2*time.Second, 10*time.Minute, false, func(context context.Context) (bool, error) {
dnsManaged, err := isDNSManaged(oc, time.Minute)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we are mixing concepts for DNSManagement here.

Previously isDNSManaged checked whether the zones existed in the dnsConfig - it was probably a mistake to call it isDNSManaged - should have been isDNSSupported or isDNSZoneConfigured more generically.

With this PR, isDNSManaged is checking the default IngressController for the DNSManaged condition being set to False.

The problem is that Gateway API has its own distinct API, configuration, and reconciliation logic, separate from IngressControllers. So, though it may work by coincidence, it feels a bit fragile and might bite us on future test updates.

Consider a scenario where another test has modified the default IngressController while the cluster has DNS zones configured (PublicZone/PrivateZone are set). For example:

  • Changed the endpoint publishing strategy to HostNetwork or NodePort
  • Set spec.endpointPublishingStrategy.loadBalancer.dnsManagementPolicy: Unmanaged

In either case, the default IngressController's DNSManaged condition would be False. However, Gateway API would still be creating and publishing DNSRecords successfully, since zones exist and the domain matches. The test would incorrectly skip DNS validation or bypass DNS resolution by dialing the LB IP directly.

My suggestion: For the Gateway API specific tests - I think we should keep the same logic from the old isDNSManaged (check dns.config.openshift.io/cluster PublicZone/PrivateZone), but rename it to isDNSZoneConfigured to avoid naming confusion over the DNSManaged IngressController condition.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that definitely makes sense.

if err != nil {
e2e.Failf("Failed to get default ingresscontroller DNSManaged status: %v", err)
}

// find the DNS Record and confirm its zone status
err = wait.PollUntilContextTimeout(context.Background(), 2*time.Second, 10*time.Minute, false, func(context context.Context) (bool, error) {
gatewayDNSRecord := &operatoringressv1.DNSRecord{}
gatewayDNSRecords, err := oc.AdminIngressClient().IngressV1().DNSRecords(ingressNamespace).List(context, metav1.ListOptions{})
if err != nil {
Expand All @@ -810,20 +817,43 @@ func assertDNSRecordStatus(oc *exutil.CLI, gatewayName string) {
for _, record := range gatewayDNSRecords.Items {
if record.Labels["gateway.networking.k8s.io/gateway-name"] == gatewayName {
gatewayDNSRecord = &record
e2e.Logf("Found the desired dnsrecord and spec is: %v", gatewayDNSRecord.Spec)
break
}
}

// checking the gateway DNS record status
if len(gatewayDNSRecord.Status.Zones) == 0 {
e2e.Logf("DNS record %q has no zones yet, retrying...", gatewayDNSRecord.Name)
return false, nil
}
Comment on lines +825 to +828
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we are now enabling assertDNSRecordStatus to check DNSRecords for unmanaged clusters in this PR - however, this logic will never pass because (see comment above) Gateway API will be unmanaged only when there are no zones. So this will get stuck.

I think we have two options:

  1. Keep it simple - don't call assertDNSRecordStatus for unsupported DNS clusters (and remove all changes to assertDNSRecordStatus)
  2. Or, you need to check dnsManaged here, and return early - there are no zones to check

Gateway API DNS logic is simpler than IngressController: the DNSRecord is always created, and whether it actually publishes depends only on whether zones exist and the domain matches baseDomain. There's no per-Gateway opt-out mechanism like the IngressController's dnsManagementPolicy or endpointPublishingStrategy fields.


// Check the Published condition for each zone
for _, zone := range gatewayDNSRecord.Status.Zones {
published := false
for _, condition := range zone.Conditions {
if condition.Type == "Published" && condition.Status == "True" {
return true, nil
if condition.Type != "Published" {
continue
}
e2e.Logf("The published status is %v for zone %v", condition.Status, zone.DNSZone)
if dnsManaged && condition.Status != "True" {
e2e.Logf("DNS record %q zone %v is not published yet, retrying...", gatewayDNSRecord.Name, zone.DNSZone)
return false, nil
}
if !dnsManaged && condition.Status == "True" {
e2e.Logf("DNS record %q zone %v is unexpectedly published for unmanaged DNS, retrying...", gatewayDNSRecord.Name, zone.DNSZone)
return false, nil
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not knowing the Gateway API code, would you be OK ignoring this condition and recording as a debug/info message instead of reporting this as an error?

}
published = true
break
}
if !published {
e2e.Logf("DNS record %q zone %v has no Published condition, retrying...", gatewayDNSRecord.Name, zone.DNSZone)
return false, nil
}
}
e2e.Logf("DNS record %q is not ready, retrying...", gatewayDNSRecord.Name)
return false, nil

e2e.Logf("All zones are checked and DNS record %q is ready", gatewayDNSRecord.Name)
return true, nil
})
o.Expect(err).NotTo(o.HaveOccurred(), "Timed out waiting for gateway %q DNSRecord to become ready", gatewayName)
}
Expand Down Expand Up @@ -1041,24 +1071,30 @@ func assertHttpRouteSuccessful(oc *exutil.CLI, gwName, name string) (*gatewayapi

// assertHttpRouteConnection checks if the http route of the given name replies successfully,
// and returns an error if not
func assertHttpRouteConnection(hostname string) {
// Create the http client to check the response status code.
client := &http.Client{
Timeout: 10 * time.Second,
Transport: &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
},
func assertHttpRouteConnection(oc *exutil.CLI, gwServiceName, hostname string, loadBalancerSupported bool) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit we guard against calling this function by checking loadBalancerSupported, but then we also now pass it in - is there a reason to do both?

The branch for e2e.Failf("Platform does not support load balancers and DNS is unmanaged - cannot verify HTTP route connectivity") will never get reached. I understand it's a safe guard, but in practice our code is getting a bit too branchy - and simpler is better IMO.

I think you should do one or the other. If you pass it in - rather than fail just return early with a log saying "skipping this...because..." or just remove it entirely - we already guard against calling it. I'd say just remove it as a new function parameter and keep this logic simple.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. Fixing that.

isDNSManaged, err := isDNSManaged(oc, time.Minute)
if err != nil {
e2e.Failf("Failed to get default ingresscontroller DNSManaged status: %v", err)
}
lbAddress := ""
if isDNSManaged {
err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, dnsResolutionTimeout, false, func(context context.Context) (bool, error) {
_, err := net.LookupHost(hostname)
if err != nil {
e2e.Logf("[%v] Failed to resolve HTTP route's hostname %q: %v, retrying...", time.Now(), hostname, err)
return false, nil
}
return true, nil
})
o.Expect(err).NotTo(o.HaveOccurred(), "Timed out waiting for HTTP route's hostname %q to be resolved: %v", hostname, err)
} else if loadBalancerSupported {
lbAddress = getLoadBalancerAddress(oc, gwServiceName)
} else {
e2e.Failf("Platform does not support load balancers and DNS is unmanaged - cannot verify HTTP route connectivity")
}

err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, dnsResolutionTimeout, false, func(context context.Context) (bool, error) {
_, err := net.LookupHost(hostname)
if err != nil {
e2e.Logf("[%v] Failed to resolve HTTP route's hostname %q: %v, retrying...", time.Now(), hostname, err)
return false, nil
}
return true, nil
})
o.Expect(err).NotTo(o.HaveOccurred(), "Timed out waiting for HTTP route's hostname %q to be resolved: %v", hostname, err)
// Create the http client to check the response status code.
client := makeHTTPClient(false, 10*time.Second, lbAddress)

// Wait for http route to respond, and when it does, check for the status code.
err = wait.PollUntilContextTimeout(context.Background(), 5*time.Second, 5*time.Minute, false, func(context context.Context) (bool, error) {
Expand Down
20 changes: 15 additions & 5 deletions test/extended/router/grpc-interop.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ var _ = g.Describe("[sig-network-edge][Conformance][Area:Networking][Feature:Rou
g.Skip("Skip on platforms where the default router is not exposed by a load balancer service.")
}

defaultDomain, err := getDefaultIngressClusterDomainName(oc, time.Minute)
o.Expect(err).NotTo(o.HaveOccurred(), "failed to find default domain name")
baseDomain, err := getClusterBaseDomainName(oc, time.Minute)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't quite see the point of changing these domains - I get that using defaultDomain as the parent domain causes DNS to point to the default IngressController (due to the CoreDNS Internal Static DNS resolver).

However, we can't use the CoreDNS Internal resolves anyways - we bypass it when we detect isDNSManaged below.

So I think this change adds complexity without solving a real problem.

o.Expect(err).NotTo(o.HaveOccurred(), "failed to find base domain name")

g.By("Locating the canary image reference")
image, err := getCanaryImage(oc)
Expand Down Expand Up @@ -196,7 +196,7 @@ var _ = g.Describe("[sig-network-edge][Conformance][Area:Networking][Feature:Rou
pemCrt2, err := certgen.MarshalCertToPEMString(tlsCrt2Data)
o.Expect(err).NotTo(o.HaveOccurred())

shardFQDN := oc.Namespace() + "." + defaultDomain
shardFQDN := oc.Namespace() + "." + baseDomain

g.By("Creating routes to test for gRPC interoperability")
routeType := oc.Namespace()
Expand Down Expand Up @@ -330,6 +330,15 @@ var _ = g.Describe("[sig-network-edge][Conformance][Area:Networking][Feature:Rou
o.Expect(shardService).NotTo(o.BeNil())
o.Expect(shardService.Status.LoadBalancer.Ingress).To(o.Not(o.BeEmpty()))

isDNSManaged, err := isDNSManaged(oc, time.Minute)
if err != nil {
e2e.Failf("Failed to get default ingresscontroller DNSManaged status: %v", err)
}
lbAddress := ""
if !isDNSManaged {
lbAddress = getLoadBalancerAddress(oc, "router-"+oc.Namespace())
}

testCases := []string{
"cancel_after_begin",
"cancel_after_first_response",
Expand All @@ -352,15 +361,15 @@ var _ = g.Describe("[sig-network-edge][Conformance][Area:Networking][Feature:Rou
routev1.TLSTerminationReencrypt,
routev1.TLSTerminationPassthrough,
} {
err := grpcExecTestCases(oc, routeType, 5*time.Minute, testCases...)
err := grpcExecTestCases(oc, routeType, 5*time.Minute, lbAddress, testCases...)
o.Expect(err).NotTo(o.HaveOccurred())
}
})
})
})

// grpcExecTestCases run gRPC interop test cases.
func grpcExecTestCases(oc *exutil.CLI, routeType routev1.TLSTerminationType, timeout time.Duration, testCases ...string) error {
func grpcExecTestCases(oc *exutil.CLI, routeType routev1.TLSTerminationType, timeout time.Duration, lbAddress string, testCases ...string) error {
host, err := getHostnameForRoute(oc, fmt.Sprintf("grpc-interop-%s", routeType))
if err != nil {
return err
Expand All @@ -371,6 +380,7 @@ func grpcExecTestCases(oc *exutil.CLI, routeType routev1.TLSTerminationType, tim
Port: 443,
UseTLS: true,
Insecure: true,
Target: lbAddress,
}

if routeType == "h2c" {
Expand Down
17 changes: 17 additions & 0 deletions test/extended/router/grpc-interop/clientconn.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package grpc_interop

import (
"context"
"crypto/tls"
"crypto/x509"
"errors"
"fmt"
"net"
"strconv"

Expand All @@ -17,6 +19,8 @@ type DialParams struct {
Host string
Port int
Insecure bool
// Target is the actual IP you want to dial instead of resolving hostname
Target string
}

func Dial(cfg DialParams) (*grpc.ClientConn, error) {
Expand Down Expand Up @@ -44,5 +48,18 @@ func Dial(cfg DialParams) (*grpc.ClientConn, error) {
opts = append(opts, grpc.WithInsecure())
}

if cfg.Target != "" {
dialer := func(ctx context.Context, addr string) (net.Conn, error) {
_, port, err := net.SplitHostPort(addr)
if err != nil {
return nil, fmt.Errorf("failed to split host:port from %q: %w", addr, err)
}
// Connect to targetIP:port regardless of hostname
dest := net.JoinHostPort(cfg.Target, port)
return (&net.Dialer{}).DialContext(ctx, "tcp", dest)
}
opts = append(opts, grpc.WithContextDialer(dialer))
}

return grpc.Dial(net.JoinHostPort(cfg.Host, strconv.Itoa(cfg.Port)), append(opts, grpc.WithBlock())...)
}
8 changes: 4 additions & 4 deletions test/extended/router/h2spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,9 @@ var _ = g.Describe("[sig-network-edge][Conformance][Area:Networking][Feature:Rou
}
}

g.By("Getting the default domain")
defaultDomain, err := getDefaultIngressClusterDomainName(oc, time.Minute)
o.Expect(err).NotTo(o.HaveOccurred(), "failed to find default domain name")
g.By("Getting the base domain")
baseDomain, err := getClusterBaseDomainName(oc, time.Minute)
o.Expect(err).NotTo(o.HaveOccurred(), "failed to find base domain name")

g.By("Locating the router image reference")
routerImage, err := exutil.FindRouterImage(oc)
Expand Down Expand Up @@ -409,7 +409,7 @@ BFNBRELPe53ZdLKWpf2Sr96vRPRNw
e2e.ExpectNoError(e2epod.WaitForPodNameRunningInNamespace(context.TODO(), oc.KubeClient(), "h2spec-haproxy", oc.KubeFramework().Namespace.Name))
e2e.ExpectNoError(e2epod.WaitForPodNameRunningInNamespace(context.TODO(), oc.KubeClient(), "h2spec", oc.KubeFramework().Namespace.Name))

shardFQDN := oc.Namespace() + "." + defaultDomain
shardFQDN := oc.Namespace() + "." + baseDomain

// The new router shard is using a namespace
// selector so label this test namespace to
Expand Down
Loading