-
Notifications
You must be signed in to change notification settings - Fork 4.8k
OCPBUGS-59176: fix several failing tests in custom-dns jobs #31129
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,7 +2,6 @@ package router | |
|
|
||
| import ( | ||
| "context" | ||
| "crypto/tls" | ||
| "errors" | ||
| "fmt" | ||
| "net" | ||
|
|
@@ -403,9 +402,9 @@ var _ = g.Describe("[sig-network-edge][OCPFeatureGate:GatewayAPIController][Feat | |
| g.By("Checking the http route using the default gateway is accepted") | ||
| assertHttpRouteSuccessful(oc, gw, "test-httproute") | ||
|
|
||
| g.By("Validating the http connectivity to the backend application") | ||
| if loadBalancerSupported && managedDNS { | ||
| assertHttpRouteConnection(defaultRoutename) | ||
| if loadBalancerSupported { | ||
| g.By("Validating the http connectivity to the backend application") | ||
| assertHttpRouteConnection(oc, gw+"-openshift-default", defaultRoutename, loadBalancerSupported) | ||
| } | ||
| }) | ||
|
|
||
|
|
@@ -612,20 +611,13 @@ func getPlatformCapabilities(oc *exutil.CLI) (loadBalancerSupported bool, manage | |
| loadBalancerSupported = false | ||
| } | ||
|
|
||
| managedDNS = isDNSManaged(oc) | ||
| managedDNS, err = isDNSManaged(oc, time.Minute) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Failed to check if DNS is managed") | ||
|
|
||
| e2e.Logf("Platform: %s, LoadBalancer supported: %t, DNS managed: %t", infra.Status.PlatformStatus.Type, loadBalancerSupported, managedDNS) | ||
| return loadBalancerSupported, managedDNS | ||
| } | ||
|
|
||
| // isDNSManaged checks if the cluster has DNS zones configured (public or private). | ||
| // On platforms like vSphere without external DNS, DNS records cannot be managed. | ||
| func isDNSManaged(oc *exutil.CLI) bool { | ||
| dnsConfig, err := oc.AdminConfigClient().ConfigV1().DNSes().Get(context.Background(), "cluster", metav1.GetOptions{}) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Failed to get DNS config") | ||
| return dnsConfig.Spec.PrivateZone != nil || dnsConfig.Spec.PublicZone != nil | ||
| } | ||
|
|
||
| // isIPv6OrDualStack checks if the cluster is using IPv6 or dual-stack networking. | ||
| // Returns true if any ServiceNetwork CIDR is IPv6 (indicates IPv6-only or dual-stack). | ||
| func isIPv6OrDualStack(oc *exutil.CLI) (bool, error) { | ||
|
|
@@ -753,18 +745,17 @@ func buildGateway(name, namespace, gcname, fromNs, domain string) *gatewayapiv1. | |
| } | ||
| } | ||
|
|
||
| // assertGatewayLoadbalancerReady verifies that the given gateway has the service's load balancer address assigned. | ||
| func assertGatewayLoadbalancerReady(oc *exutil.CLI, gwName, gwServiceName string) { | ||
| // check gateway LB service, note that External-IP might be hostname (AWS) or IP (Azure/GCP) | ||
| // return LoadBalancer service address, note that External-IP might be hostname (AWS) or IP (Azure/GCP) | ||
| func getLoadBalancerAddress(oc *exutil.CLI, serviceName string) string { | ||
| var lbAddress string | ||
| err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, loadBalancerReadyTimeout, false, func(context context.Context) (bool, error) { | ||
| lbService, err := oc.AdminKubeClient().CoreV1().Services(ingressNamespace).Get(context, gwServiceName, metav1.GetOptions{}) | ||
| lbService, err := oc.AdminKubeClient().CoreV1().Services(ingressNamespace).Get(context, serviceName, metav1.GetOptions{}) | ||
| if err != nil { | ||
| e2e.Logf("Failed to get service %q: %v, retrying...", gwServiceName, err) | ||
| e2e.Logf("Failed to get service %q: %v, retrying...", serviceName, err) | ||
| return false, nil | ||
| } | ||
| if len(lbService.Status.LoadBalancer.Ingress) == 0 { | ||
| e2e.Logf("Service %q has no load balancer; retrying...", gwServiceName) | ||
| e2e.Logf("Service %q has no load balancer; retrying...", serviceName) | ||
| return false, nil | ||
| } | ||
| if lbService.Status.LoadBalancer.Ingress[0].Hostname != "" { | ||
|
|
@@ -773,11 +764,20 @@ func assertGatewayLoadbalancerReady(oc *exutil.CLI, gwName, gwServiceName string | |
| lbAddress = lbService.Status.LoadBalancer.Ingress[0].IP | ||
| } | ||
| if lbAddress == "" { | ||
| e2e.Logf("No load balancer address for service %q, retrying", gwServiceName) | ||
| e2e.Logf("No load balancer address for service %q, retrying", serviceName) | ||
| return false, nil | ||
| } | ||
| e2e.Logf("Got load balancer address for service %q: %v", gwServiceName, lbAddress) | ||
| return true, nil | ||
| }) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Timed out to get load balancer address of service %q", serviceName) | ||
| e2e.Logf("Got load balancer address for service %q: %v", serviceName, lbAddress) | ||
| return lbAddress | ||
| } | ||
|
|
||
| // assertGatewayLoadbalancerReady verifies that the given gateway has the service's load balancer address assigned. | ||
| func assertGatewayLoadbalancerReady(oc *exutil.CLI, gwName, gwServiceName string) { | ||
| lbAddress := getLoadBalancerAddress(oc, gwServiceName) | ||
| err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, loadBalancerReadyTimeout, false, func(context context.Context) (bool, error) { | ||
| gw, err := oc.AdminGatewayApiClient().GatewayV1().Gateways(ingressNamespace).Get(context, gwName, metav1.GetOptions{}) | ||
| if err != nil { | ||
| e2e.Logf("Failed to get gateway %q: %v; retrying...", err, gwName) | ||
|
|
@@ -795,10 +795,17 @@ func assertGatewayLoadbalancerReady(oc *exutil.CLI, gwName, gwServiceName string | |
| o.Expect(err).NotTo(o.HaveOccurred(), "Timed out waiting for gateway %q to get load balancer address of service %q", gwName, gwServiceName) | ||
| } | ||
|
|
||
| // assertDNSRecordStatus polls until the DNSRecord's status in the default operand namespace is True. | ||
| // assertDNSRecordStatus polls until the DNSRecord's status in the default operand namespace is ready. | ||
| // When DNS is managed, it waits for all zones to have Published=True. | ||
| // When DNS is unmanaged, it waits for all zones to have Published!=True (expected in custom-dns clusters). | ||
| func assertDNSRecordStatus(oc *exutil.CLI, gatewayName string) { | ||
| // find the DNS Record and confirm its zone status is True | ||
| err := wait.PollUntilContextTimeout(context.Background(), 2*time.Second, 10*time.Minute, false, func(context context.Context) (bool, error) { | ||
| dnsManaged, err := isDNSManaged(oc, time.Minute) | ||
| if err != nil { | ||
| e2e.Failf("Failed to get default ingresscontroller DNSManaged status: %v", err) | ||
| } | ||
|
|
||
| // find the DNS Record and confirm its zone status | ||
| err = wait.PollUntilContextTimeout(context.Background(), 2*time.Second, 10*time.Minute, false, func(context context.Context) (bool, error) { | ||
| gatewayDNSRecord := &operatoringressv1.DNSRecord{} | ||
| gatewayDNSRecords, err := oc.AdminIngressClient().IngressV1().DNSRecords(ingressNamespace).List(context, metav1.ListOptions{}) | ||
| if err != nil { | ||
|
|
@@ -810,20 +817,43 @@ func assertDNSRecordStatus(oc *exutil.CLI, gatewayName string) { | |
| for _, record := range gatewayDNSRecords.Items { | ||
| if record.Labels["gateway.networking.k8s.io/gateway-name"] == gatewayName { | ||
| gatewayDNSRecord = &record | ||
| e2e.Logf("Found the desired dnsrecord and spec is: %v", gatewayDNSRecord.Spec) | ||
| break | ||
| } | ||
| } | ||
|
|
||
| // checking the gateway DNS record status | ||
| if len(gatewayDNSRecord.Status.Zones) == 0 { | ||
| e2e.Logf("DNS record %q has no zones yet, retrying...", gatewayDNSRecord.Name) | ||
| return false, nil | ||
| } | ||
|
Comment on lines
+825
to
+828
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So we are now enabling I think we have two options:
Gateway API DNS logic is simpler than IngressController: the |
||
|
|
||
| // Check the Published condition for each zone | ||
| for _, zone := range gatewayDNSRecord.Status.Zones { | ||
| published := false | ||
| for _, condition := range zone.Conditions { | ||
| if condition.Type == "Published" && condition.Status == "True" { | ||
| return true, nil | ||
| if condition.Type != "Published" { | ||
| continue | ||
| } | ||
| e2e.Logf("The published status is %v for zone %v", condition.Status, zone.DNSZone) | ||
| if dnsManaged && condition.Status != "True" { | ||
| e2e.Logf("DNS record %q zone %v is not published yet, retrying...", gatewayDNSRecord.Name, zone.DNSZone) | ||
| return false, nil | ||
| } | ||
| if !dnsManaged && condition.Status == "True" { | ||
| e2e.Logf("DNS record %q zone %v is unexpectedly published for unmanaged DNS, retrying...", gatewayDNSRecord.Name, zone.DNSZone) | ||
| return false, nil | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not knowing the Gateway API code, would you be OK ignoring this condition and recording as a debug/info message instead of reporting this as an error? |
||
| } | ||
| published = true | ||
| break | ||
| } | ||
| if !published { | ||
| e2e.Logf("DNS record %q zone %v has no Published condition, retrying...", gatewayDNSRecord.Name, zone.DNSZone) | ||
| return false, nil | ||
| } | ||
| } | ||
| e2e.Logf("DNS record %q is not ready, retrying...", gatewayDNSRecord.Name) | ||
| return false, nil | ||
|
|
||
| e2e.Logf("All zones are checked and DNS record %q is ready", gatewayDNSRecord.Name) | ||
| return true, nil | ||
| }) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Timed out waiting for gateway %q DNSRecord to become ready", gatewayName) | ||
| } | ||
|
|
@@ -1041,24 +1071,30 @@ func assertHttpRouteSuccessful(oc *exutil.CLI, gwName, name string) (*gatewayapi | |
|
|
||
| // assertHttpRouteConnection checks if the http route of the given name replies successfully, | ||
| // and returns an error if not | ||
| func assertHttpRouteConnection(hostname string) { | ||
| // Create the http client to check the response status code. | ||
| client := &http.Client{ | ||
| Timeout: 10 * time.Second, | ||
| Transport: &http.Transport{ | ||
| TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, | ||
| }, | ||
| func assertHttpRouteConnection(oc *exutil.CLI, gwServiceName, hostname string, loadBalancerSupported bool) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit we guard against calling this function by checking The branch for I think you should do one or the other. If you pass it in - rather than fail just return early with a log saying "skipping this...because..." or just remove it entirely - we already guard against calling it. I'd say just remove it as a new function parameter and keep this logic simple.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree. Fixing that. |
||
| isDNSManaged, err := isDNSManaged(oc, time.Minute) | ||
| if err != nil { | ||
| e2e.Failf("Failed to get default ingresscontroller DNSManaged status: %v", err) | ||
| } | ||
| lbAddress := "" | ||
| if isDNSManaged { | ||
| err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, dnsResolutionTimeout, false, func(context context.Context) (bool, error) { | ||
| _, err := net.LookupHost(hostname) | ||
| if err != nil { | ||
| e2e.Logf("[%v] Failed to resolve HTTP route's hostname %q: %v, retrying...", time.Now(), hostname, err) | ||
| return false, nil | ||
| } | ||
| return true, nil | ||
| }) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Timed out waiting for HTTP route's hostname %q to be resolved: %v", hostname, err) | ||
| } else if loadBalancerSupported { | ||
| lbAddress = getLoadBalancerAddress(oc, gwServiceName) | ||
| } else { | ||
| e2e.Failf("Platform does not support load balancers and DNS is unmanaged - cannot verify HTTP route connectivity") | ||
| } | ||
|
|
||
| err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, dnsResolutionTimeout, false, func(context context.Context) (bool, error) { | ||
| _, err := net.LookupHost(hostname) | ||
| if err != nil { | ||
| e2e.Logf("[%v] Failed to resolve HTTP route's hostname %q: %v, retrying...", time.Now(), hostname, err) | ||
| return false, nil | ||
| } | ||
| return true, nil | ||
| }) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Timed out waiting for HTTP route's hostname %q to be resolved: %v", hostname, err) | ||
| // Create the http client to check the response status code. | ||
| client := makeHTTPClient(false, 10*time.Second, lbAddress) | ||
|
|
||
| // Wait for http route to respond, and when it does, check for the status code. | ||
| err = wait.PollUntilContextTimeout(context.Background(), 5*time.Second, 5*time.Minute, false, func(context context.Context) (bool, error) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -53,8 +53,8 @@ var _ = g.Describe("[sig-network-edge][Conformance][Area:Networking][Feature:Rou | |
| g.Skip("Skip on platforms where the default router is not exposed by a load balancer service.") | ||
| } | ||
|
|
||
| defaultDomain, err := getDefaultIngressClusterDomainName(oc, time.Minute) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "failed to find default domain name") | ||
| baseDomain, err := getClusterBaseDomainName(oc, time.Minute) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't quite see the point of changing these domains - I get that using However, we can't use the CoreDNS Internal resolves anyways - we bypass it when we detect So I think this change adds complexity without solving a real problem. |
||
| o.Expect(err).NotTo(o.HaveOccurred(), "failed to find base domain name") | ||
|
|
||
| g.By("Locating the canary image reference") | ||
| image, err := getCanaryImage(oc) | ||
|
|
@@ -196,7 +196,7 @@ var _ = g.Describe("[sig-network-edge][Conformance][Area:Networking][Feature:Rou | |
| pemCrt2, err := certgen.MarshalCertToPEMString(tlsCrt2Data) | ||
| o.Expect(err).NotTo(o.HaveOccurred()) | ||
|
|
||
| shardFQDN := oc.Namespace() + "." + defaultDomain | ||
| shardFQDN := oc.Namespace() + "." + baseDomain | ||
|
|
||
| g.By("Creating routes to test for gRPC interoperability") | ||
| routeType := oc.Namespace() | ||
|
|
@@ -330,6 +330,15 @@ var _ = g.Describe("[sig-network-edge][Conformance][Area:Networking][Feature:Rou | |
| o.Expect(shardService).NotTo(o.BeNil()) | ||
| o.Expect(shardService.Status.LoadBalancer.Ingress).To(o.Not(o.BeEmpty())) | ||
|
|
||
| isDNSManaged, err := isDNSManaged(oc, time.Minute) | ||
| if err != nil { | ||
| e2e.Failf("Failed to get default ingresscontroller DNSManaged status: %v", err) | ||
| } | ||
| lbAddress := "" | ||
| if !isDNSManaged { | ||
| lbAddress = getLoadBalancerAddress(oc, "router-"+oc.Namespace()) | ||
| } | ||
|
|
||
| testCases := []string{ | ||
| "cancel_after_begin", | ||
| "cancel_after_first_response", | ||
|
|
@@ -352,15 +361,15 @@ var _ = g.Describe("[sig-network-edge][Conformance][Area:Networking][Feature:Rou | |
| routev1.TLSTerminationReencrypt, | ||
| routev1.TLSTerminationPassthrough, | ||
| } { | ||
| err := grpcExecTestCases(oc, routeType, 5*time.Minute, testCases...) | ||
| err := grpcExecTestCases(oc, routeType, 5*time.Minute, lbAddress, testCases...) | ||
| o.Expect(err).NotTo(o.HaveOccurred()) | ||
| } | ||
| }) | ||
| }) | ||
| }) | ||
|
|
||
| // grpcExecTestCases run gRPC interop test cases. | ||
| func grpcExecTestCases(oc *exutil.CLI, routeType routev1.TLSTerminationType, timeout time.Duration, testCases ...string) error { | ||
| func grpcExecTestCases(oc *exutil.CLI, routeType routev1.TLSTerminationType, timeout time.Duration, lbAddress string, testCases ...string) error { | ||
| host, err := getHostnameForRoute(oc, fmt.Sprintf("grpc-interop-%s", routeType)) | ||
| if err != nil { | ||
| return err | ||
|
|
@@ -371,6 +380,7 @@ func grpcExecTestCases(oc *exutil.CLI, routeType routev1.TLSTerminationType, tim | |
| Port: 443, | ||
| UseTLS: true, | ||
| Insecure: true, | ||
| Target: lbAddress, | ||
| } | ||
|
|
||
| if routeType == "h2c" { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we are mixing concepts for DNSManagement here.
Previously
isDNSManagedchecked whether the zones existed in the dnsConfig - it was probably a mistake to call itisDNSManaged- should have beenisDNSSupportedorisDNSZoneConfiguredmore generically.With this PR,
isDNSManagedis checking thedefaultIngressController for theDNSManagedcondition being set toFalse.The problem is that Gateway API has its own distinct API, configuration, and reconciliation logic, separate from IngressControllers. So, though it may work by coincidence, it feels a bit fragile and might bite us on future test updates.
Consider a scenario where another test has modified the
defaultIngressController while the cluster has DNS zones configured (PublicZone/PrivateZone are set). For example:HostNetworkorNodePortspec.endpointPublishingStrategy.loadBalancer.dnsManagementPolicy: UnmanagedIn either case, the default IngressController's
DNSManagedcondition would be False. However, Gateway API would still be creating and publishing DNSRecords successfully, since zones exist and the domain matches. The test would incorrectly skip DNS validation or bypass DNS resolution by dialing the LB IP directly.My suggestion: For the Gateway API specific tests - I think we should keep the same logic from the old
isDNSManaged(check dns.config.openshift.io/cluster PublicZone/PrivateZone), but rename it toisDNSZoneConfiguredto avoid naming confusion over theDNSManagedIngressController condition.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, that definitely makes sense.