Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions distribution/docker/peon.sh
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,13 @@ fi
# If TASK_JSON is not set, CliPeon will pull the task.json file from deep storage.
mkdir -p ${TASK_DIR}; [ -n "$TASK_JSON" ] && echo ${TASK_JSON} | base64 -d | gzip -d > ${TASK_DIR}/task.json;

# Combine options from jvm.config and those given as JAVA_OPTS
# If a value is specified in both then JAVA_OPTS will take precedence when using OpenJDK
# (for repeated flags the last occurrence wins). However this behavior is not part of
# the spec and is thus implementation specific.
if [ -f "$SERVICE_CONF_DIR/jvm.config" ]; then
  # Read the file directly (no useless cat) and quote the path so a
  # SERVICE_CONF_DIR containing spaces does not word-split the test or the redirect.
  # xargs collapses the one-option-per-line jvm.config into a single argument string.
  JAVA_OPTS="$(xargs < "$SERVICE_CONF_DIR/jvm.config") $JAVA_OPTS"
fi

# Start peon using CliPeon, with variables `Main internal peon TASK_DIR ATTEMPT_ID`
if [ -n "$TASK_ID" ]; then
# TASK_ID is only set from PodTemplateTaskAdapter
Expand Down
2 changes: 2 additions & 0 deletions docs/development/extensions-core/k8s-jobs.md
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,8 @@ template:

Any runtime property or JVM config used by the peon process can also be passed. For example, the following ConfigMap can be used to generate the `nodetype-config-volume` mount in the above template.

The peon container startup script (`peon.sh`) reads `jvm.config` from the directory mounted as `nodetype-config-volume` and prepends its contents to the peon's `JAVA_OPTS` before launching the JVM. If an option is set both in `jvm.config` and in `JAVA_OPTS` (for example via `druid.indexer.runner.javaOptsArray` when using the `overlordSingleContainer` or `overlordMultiContainer` adapters, which inject `JAVA_OPTS` as a container environment variable), the `JAVA_OPTS` value takes precedence under OpenJDK.

<details>
<summary>Example ConfigMap</summary>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,23 @@
*/
abstract class BaseKubernetesTaskRunnerDockerTest extends IngestionSmokeTest implements LatestImageDockerTest
{
protected static final String MANIFEST_TEMPLATE = "manifests/druid-service-with-operator.yaml";
private static final String DEFAULT_MANIFEST_TEMPLATE = "manifests/druid-service-with-operator.yaml";

protected K3sClusterResource k3sCluster;

/**
* Subclasses override to enable/disable SharedInformer caching.
*/
protected abstract boolean useSharedInformers();

  /**
   * Returns the classpath location of the operator manifest template used to
   * deploy the Druid cluster. Subclasses override to swap in a different
   * operator manifest template; the default is
   * {@code manifests/druid-service-with-operator.yaml}.
   */
  protected String getManifestTemplate()
  {
    return DEFAULT_MANIFEST_TEMPLATE;
  }

@Override
protected EmbeddedDruidCluster addServers(EmbeddedDruidCluster cluster)
{
Expand All @@ -54,9 +64,9 @@ protected EmbeddedDruidCluster addServers(EmbeddedDruidCluster cluster)
.addProperty("druid.indexer.runner.k8sSharedInformerResyncPeriod", "PT1s")
.usingPort(30090);

final K3sClusterResource k3sCluster = new K3sClusterWithOperatorResource()
this.k3sCluster = new K3sClusterWithOperatorResource()
.usingDruidTestImage()
.usingDruidManifestTemplate(MANIFEST_TEMPLATE)
.usingDruidManifestTemplate(getManifestTemplate())
.addService(new K3sDruidService(DruidCommand.Server.COORDINATOR).usingPort(30081))
.addService(overlordService)
.addService(new K3sDruidService(DruidCommand.Server.HISTORICAL).usingPort(30083))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,15 @@ protected void waitUntilPodsAreReady(String namespace)
client.pods().inNamespace(namespace).resources().forEach(this::waitUntilPodIsReady);
}

  /**
   * Exposes the fabric8 {@link KubernetesClient} for tests that need to interact
   * with the cluster directly (e.g. discover task-launched peon pods).
   *
   * <p>NOTE(review): the returned client's lifecycle appears to be managed by
   * this resource (see {@code stop()}), so callers presumably should not close
   * it themselves — confirm against the full class.
   */
  public KubernetesClient getKubernetesClient()
  {
    return client;
  }

@Override
public void stop()
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.testing.embedded.k8s;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.fabric8.kubernetes.api.model.OwnerReference;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.LocalPortForward;
import org.apache.druid.common.utils.IdUtils;
import org.apache.druid.indexing.common.task.NoopTask;
import org.apache.druid.query.DruidMetrics;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URL;
import java.util.List;
import java.util.Map;

/**
* Regression test for https://github.com/apache/druid/issues/18791.
*
* Verifies that {@code distribution/docker/peon.sh} sources options from
* {@code jvm.config} and that those options reach the peon JVM as system
* properties. Before the fix, {@code peon.sh} silently ignored
* {@code jvm.config}, so any JVM flags set there — including memory limits
* users had configured to prevent OOMs — never applied.
*
* <p>The test uses an operator manifest that injects
* {@code -Ddruid.test.peon.jvmconfig.marker=true} into the cluster-level
* {@code jvm.options}. The Druid operator writes these to {@code jvm.config}
* on each node, including the overlord. When a peon is launched via the
* {@code K8sTaskAdapter}, it inherits the overlord's pod spec (including the
* mounted {@code jvm.config}); {@code peon.sh} then sources that file and
* prepends its contents to {@code JAVA_OPTS}. The marker therefore appears in
* the peon JVM's system properties, which this test asserts by querying
* {@code /status/properties} on the peon pod.
*/
@Disabled("requires charts.datainfra.io chart, see https://github.com/apache/druid/pull/19047")
public class KubernetesPeonJvmConfigDockerTest extends BaseKubernetesTaskRunnerDockerTest
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Regression coverage is permanently disabled

The new regression test that verifies peon.sh sources jvm.config is annotated with @disabled, so the fix has no active automated coverage. Since this bug is specifically in container startup behavior, leaving the only regression test disabled means the same breakage can return without CI noticing. Please either make this test runnable in the existing embedded test setup or add a narrower active test around the script/config generation path.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just pushed something to cover this

{
  // System property injected via jvm.options in the marker manifest; its presence
  // in the peon JVM's /status/properties proves peon.sh sourced jvm.config.
  private static final String MARKER_KEY = "druid.test.peon.jvmconfig.marker";
  private static final String MARKER_VALUE = "true";
  // Operator manifest template that adds the marker flag to jvm.options.
  private static final String MARKER_MANIFEST =
      "manifests/druid-service-with-operator-peonjvmconfig.yaml";

  /**
   * Matches {@code DruidK8sConstants.PORT} but duplicated here to avoid
   * pulling the whole {@code druid-kubernetes-overlord-extensions} module in
   * as a test-scope dep just for one integer.
   */
  private static final int PEON_HTTP_PORT = 8100;

  // How long to wait for the Job-owned peon pod to report Ready.
  private static final long PEON_POD_READY_TIMEOUT_MILLIS = 180_000L;
  // How long to keep polling the peon's /status/properties endpoint.
  private static final long PROPERTIES_POLL_TIMEOUT_MILLIS = 60_000L;

  // Jackson mapper and target type for parsing the /status/properties response.
  private static final ObjectMapper MAPPER = new ObjectMapper();
  private static final TypeReference<Map<String, String>> MAP_TYPE = new TypeReference<>() {};

  /** Disables SharedInformer caching for this test run (base-class hook). */
  @Override
  protected boolean useSharedInformers()
  {
    return false;
  }

  /** Uses the manifest that injects the jvm.config marker flag into jvm.options. */
  @Override
  protected String getManifestTemplate()
  {
    return MARKER_MANIFEST;
  }

@Test
public void test_peonSourcesJvmConfigMarker() throws Exception
{
final String taskId = IdUtils.getRandomId();
// Keep the peon alive long enough to discover its pod, port-forward, and hit the status endpoint.
final long runDurationMillis = 240_000L;

cluster.callApi().onLeaderOverlord(
o -> o.runTask(
taskId,
new NoopTask(taskId, null, dataSource, runDurationMillis, 0L, null)
)
);

try {
eventCollector.latchableEmitter().waitForEvent(
event -> event.hasMetricName(NoopTask.EVENT_STARTED)
.hasDimension(DruidMetrics.TASK_ID, taskId)
);

final KubernetesClient client = k3sCluster.getKubernetesClient();
final Pod peonPod = waitForReadyPeonPod(client);

try (LocalPortForward portForward = client.pods()
.inNamespace(K3sClusterResource.DRUID_NAMESPACE)
.withName(peonPod.getMetadata().getName())
.portForward(PEON_HTTP_PORT)) {
final Map<String, String> peonProperties = pollForStatusProperties(portForward.getLocalPort());
Assertions.assertEquals(
MARKER_VALUE,
peonProperties.get(MARKER_KEY),
"Expected jvm.config marker to reach peon JVM as a system property. "
+ "This is a regression: peon.sh must source $SERVICE_CONF_DIR/jvm.config."
);
}
}
finally {
try {
cluster.callApi().onLeaderOverlord(o -> o.cancelTask(taskId));
}
catch (Exception ignore) {
// Best-effort cleanup.
}
}
}

private Pod waitForReadyPeonPod(KubernetesClient client) throws InterruptedException
{
final long deadline = System.currentTimeMillis() + PEON_POD_READY_TIMEOUT_MILLIS;
while (System.currentTimeMillis() < deadline) {
for (Pod pod : client.pods().inNamespace(K3sClusterResource.DRUID_NAMESPACE).list().getItems()) {
if (!ownedByJob(pod)) {
continue;
}
if (isReady(pod)) {
return pod;
}
}
Thread.sleep(2_000L);
}
throw new AssertionError(
"No Job-owned pod became Ready within "
+ (PEON_POD_READY_TIMEOUT_MILLIS / 1000L) + "s — expected a peon pod to appear"
);
}

private static boolean ownedByJob(Pod pod)
{
final List<OwnerReference> owners = pod.getMetadata().getOwnerReferences();
return owners != null && owners.stream().anyMatch(o -> "Job".equals(o.getKind()));
}

private static boolean isReady(Pod pod)
{
return pod.getStatus() != null
&& pod.getStatus().getConditions() != null
&& pod.getStatus().getConditions().stream().anyMatch(
c -> "Ready".equals(c.getType()) && "True".equals(c.getStatus())
);
}

private Map<String, String> pollForStatusProperties(int localPort) throws InterruptedException
{
final long deadline = System.currentTimeMillis() + PROPERTIES_POLL_TIMEOUT_MILLIS;
Exception lastException = null;
while (System.currentTimeMillis() < deadline) {
try {
final URL url = URI.create("http://localhost:" + localPort + "/status/properties").toURL();
final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setConnectTimeout(2_000);
conn.setReadTimeout(2_000);
if (conn.getResponseCode() == 200) {
try (InputStream is = conn.getInputStream()) {
return MAPPER.readValue(is, MAP_TYPE);
}
}
}
catch (Exception e) {
lastException = e;
}
Thread.sleep(1_000L);
}
final String suffix = lastException == null ? "" : " Last error: " + lastException;
throw new AssertionError(
"Peon /status/properties did not return 200 within "
+ (PROPERTIES_POLL_TIMEOUT_MILLIS / 1000L) + "s." + suffix
);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Operator manifest template for the peon jvm.config regression test.
# The only intended delta from druid-service-with-operator.yaml is the extra
# -Ddruid.test.peon.jvmconfig.marker=true flag in jvm.options, which
# KubernetesPeonJvmConfigDockerTest asserts appears in the peon's
# /status/properties.
# NOTE(review): structural indentation restored from a whitespace-mangled
# paste — verify against the original template before merging.
apiVersion: "druid.apache.org/v1alpha1"
kind: "Druid"
metadata:
  name: test-cluster-${service}
spec:
  image: ${image}
  startScript: /druid.sh
  scalePvcSts: true
  rollingDeploy: true
  defaultProbes: false
  podLabels:
    environment: stage
    release: alpha
  podAnnotations:
    dummy: k8s_extn_needs_atleast_one_annotation
  volumes:
    - name: mysqlconnector
      emptyDir: { }
  securityContext:
    fsGroup: 0
    runAsUser: 0
    runAsGroup: 0
  containerSecurityContext:
    privileged: true
  commonConfigMountPath: "/opt/druid/conf/druid/cluster/_common"
  common.runtime.properties: |
    ${commonRuntimeProperties}
  # The operator writes these options to jvm.config on every node; the final
  # line is the marker flag the regression test looks for on the peon.
  jvm.options: |-
    -server
    -Djava.net.preferIPv4Stack=true
    -XX:MaxDirectMemorySize=10240g
    -Duser.timezone=UTC
    -Dfile.encoding=UTF-8
    -Dlog4j.debug
    -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
    -Ddruid.test.peon.jvmconfig.marker=true
  log4j.config: |-
    <?xml version="1.0" encoding="UTF-8" ?>
    <Configuration status="WARN">
      <Appenders>
        <Console name="Console" target="SYSTEM_OUT">
          <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
        </Console>
        <File name="FileAppender" fileName="log/${sys:druid.node.type}.log">
          <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
        </File>
      </Appenders>
      <Loggers>
        <Root level="info">
          <AppenderRef ref="Console"/>
          <AppenderRef ref="FileAppender"/>
        </Root>
      </Loggers>
    </Configuration>
  env:
    - name: POD_NAME
      valueFrom:
        fieldRef:
          fieldPath: metadata.name
    - name: POD_NAMESPACE
      valueFrom:
        fieldRef:
          fieldPath: metadata.namespace
  nodes:
    ${service}s:
      nodeType: ${service}
      priorityClassName: system-cluster-critical
      druid.port: ${port}
      services:
        - spec:
            type: NodePort
            ports:
              - name: http
                port: ${port}
                targetPort: ${port}
                nodePort: ${port}
      replicas: 1
      nodeConfigMountPath: "/opt/druid/conf/druid/cluster/${serviceFolder}"
      runtime.properties: |
        ${nodeRuntimeProperties}
      livenessProbe:
        failureThreshold: 10
        httpGet:
          path: /status/health
          port: ${port}
        initialDelaySeconds: 5
        periodSeconds: 10
        successThreshold: 1
        timeoutSeconds: 5
      readinessProbe:
        failureThreshold: 20
        httpGet:
          path: /status/health
          port: ${port}
        initialDelaySeconds: 5
        periodSeconds: 10
        successThreshold: 1
        timeoutSeconds: 5
      startUpProbe:
        failureThreshold: 20
        httpGet:
          path: /status/health
          port: ${port}
        initialDelaySeconds: 60
        periodSeconds: 30
        successThreshold: 1
        timeoutSeconds: 10
      volumeMounts:
        - mountPath: /druid/data
          name: druid-shared-storage
      volumes:
        - name: druid-shared-storage
          hostPath:
            path: /druid/shared-storage
            type: DirectoryOrCreate
Loading