diff --git a/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/http/HttpCollectImpl.java b/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/http/HttpCollectImpl.java
index 03e2d0dc4c5..ab9f2dca919 100644
--- a/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/http/HttpCollectImpl.java
+++ b/hertzbeat-collector/hertzbeat-collector-basic/src/main/java/org/apache/hertzbeat/collector/collect/http/HttpCollectImpl.java
@@ -30,6 +30,7 @@
 import java.io.StringReader;
 import java.net.ConnectException;
 import java.net.UnknownHostException;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.LinkedList;
@@ -82,6 +83,7 @@
 import org.apache.http.client.methods.HttpUriRequest;
 import org.apache.http.client.methods.RequestBuilder;
 import org.apache.http.client.protocol.HttpClientContext;
+import org.apache.http.entity.ContentType;
 import org.apache.http.entity.StringEntity;
 import org.apache.http.impl.client.BasicCredentialsProvider;
 import org.apache.http.protocol.HttpContext;
@@ -114,6 +116,13 @@ public class HttpCollectImpl extends AbstractCollect {
      */
     private static final List<Pattern> DANGEROUS_XPATH_PATTERNS;
 
+    /**
+     * Captures the value of the {@code encoding} pseudo-attribute of an XML declaration.
+     */
+    private static final Pattern XML_ENCODING_PATTERN = Pattern.compile(
+            "<\\?xml\\s+[^>]*encoding\\s*=\\s*[\"']([^\"']+)[\"']",
+            Pattern.CASE_INSENSITIVE);
+
     static {
         List<Pattern> patterns = new ArrayList<>();
         for (String pattern : CollectorConstants.DANGEROUS_XPATH_PATTERNS) {
@@ -182,7 +191,7 @@ public void collect(CollectRep.MetricsData.Builder builder, Metrics metrics) {
              Option 1: Parse using InputStream, but this requires significant code changes;
              Option 2: Manually trigger garbage collection, similar to how it's done in Dubbo for large inputs.
              */
-            String resp = entity == null ? "" : EntityUtils.toString(entity, StandardCharsets.UTF_8);
+            String resp = entity == null ? "" : readEntityAsString(entity, isXmlParseType(parseType));
             if (!StringUtils.hasText(resp)) {
                 log.info("http response entity is empty, status: {}.", statusCode);
             }
@@ -240,6 +249,79 @@ public void collect(CollectRep.MetricsData.Builder builder, Metrics metrics) {
         }
     }
 
+    /**
+     * Checks whether the configured parse type consumes the response body as XML.
+     *
+     * @param parseType parse type configured for the metrics, may be {@code null}
+     * @return {@code true} for the XML-path and sitemap parse types
+     */
+    private boolean isXmlParseType(String parseType) {
+        return DispatchConstants.PARSE_XML_PATH.equals(parseType) || DispatchConstants.PARSE_SITE_MAP.equals(parseType);
+    }
+
+    /**
+     * Reads the response entity into a string.
+     *
+     * <p>Bodies that are not parsed as XML are decoded by {@code EntityUtils.toString} with UTF-8 as default.
+     * XML bodies use the Content-Type charset when it is present and supported, otherwise the charset
+     * detected from the document itself (see {@link #detectXmlCharset(byte[])}).
+     *
+     * @param entity response entity, never {@code null}
+     * @param xmlAware whether the body is going to be parsed as XML
+     * @return decoded body, or {@code null} when the entity exposes no content stream
+     * @throws IOException if the entity content cannot be read
+     */
+    private String readEntityAsString(HttpEntity entity, boolean xmlAware) throws IOException {
+        if (!xmlAware) {
+            return EntityUtils.toString(entity, StandardCharsets.UTF_8);
+        }
+        byte[] content = EntityUtils.toByteArray(entity);
+        if (content == null) {
+            // No content stream: EntityUtils.toString returns null in this case as well.
+            return null;
+        }
+        // Lenient parsing maps an unsupported or malformed header charset to null instead of throwing
+        // an unchecked exception, so such responses still reach the in-document detection below.
+        ContentType contentType = ContentType.getLenient(entity);
+        Charset charset = contentType == null ? null : contentType.getCharset();
+        if (charset == null) {
+            charset = detectXmlCharset(content);
+        }
+        return new String(content, charset);
+    }
+
+    /**
+     * Detects the charset of an XML document from its leading bytes.
+     *
+     * <p>A UTF-16 byte order mark wins; otherwise the {@code encoding} pseudo-attribute of the XML
+     * declaration is looked up in the first 256 bytes. UTF-8 is the fallback.
+     *
+     * @param content raw response body, must not be {@code null}
+     * @return detected charset, never {@code null}
+     */
+    private Charset detectXmlCharset(byte[] content) {
+        if (content.length >= 2) {
+            int first = content[0] & 0xFF;
+            int second = content[1] & 0xFF;
+            if ((first == 0xFE && second == 0xFF) || (first == 0xFF && second == 0xFE)) {
+                // The declaration is not ASCII-readable in UTF-16; the decoder consumes the BOM itself.
+                return StandardCharsets.UTF_16;
+            }
+        }
+        int previewLength = Math.min(content.length, 256);
+        String preview = new String(content, 0, previewLength, StandardCharsets.US_ASCII);
+        Matcher matcher = XML_ENCODING_PATTERN.matcher(preview);
+        if (matcher.find()) {
+            try {
+                return Charset.forName(matcher.group(1));
+            } catch (IllegalArgumentException ignored) {
+                // Illegal or unsupported charset name in the declaration: use the UTF-8 default.
+                return StandardCharsets.UTF_8;
+            }
+        }
+        return StandardCharsets.UTF_8;
+    }
+
     private void parseResponseByHeader(CollectRep.MetricsData.Builder builder, List<String> aliases, CloseableHttpResponse response) {
         CollectRep.ValueRow.Builder valueRowBuilder = CollectRep.ValueRow.newBuilder();
         for (String alias : aliases) {
diff --git a/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/http/HttpCollectImplTest.java b/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/http/HttpCollectImplTest.java
index 5ea4596da26..05199a2feb7 100644
--- a/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/http/HttpCollectImplTest.java
+++ b/hertzbeat-collector/hertzbeat-collector-basic/src/test/java/org/apache/hertzbeat/collector/collect/http/HttpCollectImplTest.java
@@ -18,6 +18,7 @@
 package org.apache.hertzbeat.collector.collect.http;
 
 import com.google.common.collect.Lists;
+import com.sun.net.httpserver.HttpServer;
 import org.apache.hertzbeat.collector.dispatch.DispatchConstants;
 import org.apache.hertzbeat.common.entity.job.Metrics;
 import org.apache.hertzbeat.common.entity.job.protocol.HttpProtocol;
@@ -28,6 +29,7 @@
 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.lang.reflect.Method;
+import java.net.InetSocketAddress;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -190,6 +192,94 @@ public CollectRep.MetricsData.Builder addValueRow(CollectRep.ValueRow valueRow)
         assertEquals("0", secondRow.getColumns(3), "Second server memory should be 0");
     }
 
+    @Test
+    void parseResponseByXmlPathHonorsXmlEncodingDeclaration() throws Exception {
+        String xmlResponse = """
+                <?xml version="1.0" encoding="ISO-8859-1"?>
+                <root>
+                    <item>
+                        <name>café</name>
+                    </item>
+                </root>
+                """;
+        byte[] responseBytes = xmlResponse.getBytes(StandardCharsets.ISO_8859_1);
+        HttpServer server = HttpServer.create(new InetSocketAddress("127.0.0.1", 0), 0);
+        server.createContext("/metrics", exchange -> {
+            exchange.getResponseHeaders().add("Content-Type", "application/xml");
+            exchange.sendResponseHeaders(200, responseBytes.length);
+            exchange.getResponseBody().write(responseBytes);
+            exchange.close();
+        });
+        server.start();
+
+        try {
+            HttpProtocol http = HttpProtocol.builder()
+                    .method("GET")
+                    .host("127.0.0.1")
+                    .port(String.valueOf(server.getAddress().getPort()))
+                    .url("/metrics")
+                    .parseType(DispatchConstants.PARSE_XML_PATH)
+                    .parseScript("//item")
+                    .build();
+            Metrics metrics = Metrics.builder()
+                    .http(http)
+                    .aliasFields(Lists.newArrayList("name"))
+                    .build();
+            CollectRep.MetricsData.Builder builder = CollectRep.MetricsData.newBuilder();
+
+            httpCollectImpl.collect(builder, metrics);
+
+            assertEquals(1, builder.getValuesCount());
+            assertEquals("café", builder.getValues(0).getColumns(0));
+        } finally {
+            server.stop(0);
+        }
+    }
+
+    @Test
+    void parseResponseByXmlPathFallsBackToUtf8ForUnsupportedXmlEncoding() throws Exception {
+        String xmlResponse = """
+                <?xml version="1.0" encoding="x-unsupported-charset"?>
+                <root>
+                    <item>
+                        <name>service</name>
+                    </item>
+                </root>
+                """;
+        byte[] responseBytes = xmlResponse.getBytes(StandardCharsets.UTF_8);
+        HttpServer server = HttpServer.create(new InetSocketAddress("127.0.0.1", 0), 0);
+        server.createContext("/metrics", exchange -> {
+            exchange.getResponseHeaders().add("Content-Type", "application/xml");
+            exchange.sendResponseHeaders(200, responseBytes.length);
+            exchange.getResponseBody().write(responseBytes);
+            exchange.close();
+        });
+        server.start();
+
+        try {
+            HttpProtocol http = HttpProtocol.builder()
+                    .method("GET")
+                    .host("127.0.0.1")
+                    .port(String.valueOf(server.getAddress().getPort()))
+                    .url("/metrics")
+                    .parseType(DispatchConstants.PARSE_XML_PATH)
+                    .parseScript("//item")
+                    .build();
+            Metrics metrics = Metrics.builder()
+                    .http(http)
+                    .aliasFields(Lists.newArrayList("name"))
+                    .build();
+            CollectRep.MetricsData.Builder builder = CollectRep.MetricsData.newBuilder();
+
+            httpCollectImpl.collect(builder, metrics);
+
+            assertEquals(1, builder.getValuesCount());
+            assertEquals("service", builder.getValues(0).getColumns(0));
+        } finally {
+            server.stop(0);
+        }
+    }
+
     @Test
     void parseResponseByJsonPath() throws Exception {
         String jsonResponse = "{"