From 2840051b9bdec17afe135b312de7b7d5d347270f Mon Sep 17 00:00:00 2001 From: Abhishek Choudhary Date: Thu, 11 Jun 2026 17:53:14 +0800 Subject: [PATCH] fix(ai-aws-content-moderation): moderate decoded LLM content instead of raw body The plugin sent the raw HTTP request body to AWS Comprehend, so it scored the undecoded JSON envelope (e.g. the "messages" wrapper and escape sequences like a literal backslash-u sequence) instead of the actual prompt the upstream LLM acts on. This makes the moderation see different text than the model and adds noise to the toxicity result. Make the plugin protocol-aware, like ai-aliyun-content-moderation: require application/json, parse the body, detect the client protocol via ai-protocols, and send only the normalized, decoded LLM-visible content to Comprehend. Non-AI requests (non-JSON, unparseable, or bodies that carry no LLM content) are governed by the existing fail_mode. --- apisix/plugins/ai-aws-content-moderation.lua | 39 +++++- .../plugins/ai-aws-content-moderation.md | 2 +- t/plugin/ai-aws-content-moderation.t | 121 ++++++++++++++++-- t/plugin/ai-aws-content-moderation2.t | 4 +- 4 files changed, 152 insertions(+), 14 deletions(-) diff --git a/apisix/plugins/ai-aws-content-moderation.lua b/apisix/plugins/ai-aws-content-moderation.lua index a3a1295c6870..8d8e3739e820 100644 --- a/apisix/plugins/ai-aws-content-moderation.lua +++ b/apisix/plugins/ai-aws-content-moderation.lua @@ -18,6 +18,7 @@ require("resty.aws.config") -- to read env vars before initing aws module local core = require("apisix.core") local binding = require("apisix.plugins.ai-protocols.binding") +local protocols = require("apisix.plugins.ai-protocols") local aws = require("resty.aws") local aws_instance @@ -107,9 +108,41 @@ function _M.rewrite(conf, ctx) return end - local body, err = core.request.get_body() + local body, err = core.request.get_json_request_body_table() if not body then - return HTTP_BAD_REQUEST, err + local msg = type(err) == "table" and err.message or err + local handled, code, resp = binding.on_unsupported( + conf.fail_mode, _M.name, ctx, + "failed to parse request body: " .. (msg or "invalid JSON"), + HTTP_BAD_REQUEST, err) + if handled then + return code, resp + end + return + end + + -- The plugin runs before ai-proxy, so detect the client protocol here rather + -- than relying on ctx.ai_client_protocol. "passthrough" is the catch-all for + -- non-AI bodies, which carry no LLM content to moderate. + local protocol_name = protocols.detect(body, ctx) + local proto = protocol_name and protocols.get(protocol_name) + if not proto or protocol_name == "passthrough" or not proto.extract_request_content then + local handled, code, resp = binding.on_unsupported( + conf.fail_mode, _M.name, ctx, + "no supported AI protocol for the request", + HTTP_BAD_REQUEST, "no supported AI protocol for the request") + if handled then + return code, resp + end + return + end + + -- moderate the decoded LLM-visible content, not the raw JSON envelope + local contents = proto.extract_request_content(body) + local text = core.table.concat(contents, " ") + if text == "" then + -- no LLM-visible content to moderate + return end local comprehend = conf.comprehend @@ -139,7 +172,7 @@ function _M.rewrite(conf, ctx) local res, err = comprehend:detectToxicContent({ LanguageCode = "en", TextSegments = {{ - Text = body + Text = text }}, }) diff --git a/docs/en/latest/plugins/ai-aws-content-moderation.md b/docs/en/latest/plugins/ai-aws-content-moderation.md index fce2755a3239..239475ff986c 100644 --- a/docs/en/latest/plugins/ai-aws-content-moderation.md +++ b/docs/en/latest/plugins/ai-aws-content-moderation.md @@ -38,7 +38,7 @@ import TabItem from '@theme/TabItem'; The `ai-aws-content-moderation` Plugin integrates with [AWS Comprehend](https://aws.amazon.com/comprehend/) to check request bodies for toxicity when proxying to LLMs, such as profanity, hate speech, insult, harassment, violence, and more, rejecting requests if the evaluated outcome exceeds the configured threshold. -This Plugin must be used in Routes that proxy requests to LLMs only. +This Plugin must be used in Routes that proxy requests to LLMs only. The Plugin parses the `application/json` request body and sends only the decoded LLM-visible content (for example `messages[].content`) to AWS Comprehend, rather than the raw request body. Requests that are not recognized AI requests (non-JSON bodies, or JSON that carries no LLM content) are handled according to `fail_mode`. ## Plugin Attributes diff --git a/t/plugin/ai-aws-content-moderation.t b/t/plugin/ai-aws-content-moderation.t index 765bba1ab567..0bdd8ebbbfe9 100644 --- a/t/plugin/ai-aws-content-moderation.t +++ b/t/plugin/ai-aws-content-moderation.t @@ -139,7 +139,9 @@ passed === TEST 2: toxic request should fail --- request POST /echo -toxic +{"messages":[{"role":"user","content":"toxic"}]} +--- more_headers +Content-Type: application/json --- error_code: 400 --- response_body chomp request body exceeds toxicity threshold @@ -149,7 +151,9 @@ request body exceeds toxicity threshold === TEST 3: good request should pass --- request POST /echo -good_request +{"messages":[{"role":"user","content":"good_request"}]} +--- more_headers +Content-Type: application/json --- error_code: 200 @@ -199,7 +203,9 @@ passed === TEST 5: profane request should fail --- request POST /echo -profane +{"messages":[{"role":"user","content":"profane"}]} +--- more_headers +Content-Type: application/json --- error_code: 400 --- response_body chomp request body exceeds PROFANITY threshold @@ -209,7 +215,9 @@ request body exceeds PROFANITY threshold === TEST 6: very profane request should also fail --- request POST /echo -very_profane +{"messages":[{"role":"user","content":"very_profane"}]} +--- more_headers +Content-Type: application/json --- error_code: 400 --- response_body chomp request body exceeds PROFANITY threshold @@ -219,7 +227,9 @@ request body exceeds PROFANITY threshold === TEST 7: good_request should pass --- request POST /echo -good_request +{"messages":[{"role":"user","content":"good_request"}]} +--- more_headers +Content-Type: application/json --- error_code: 200 @@ -269,7 +279,9 @@ passed === TEST 9: profane request should pass profanity check but fail toxicity check --- request POST /echo -profane +{"messages":[{"role":"user","content":"profane"}]} +--- more_headers +Content-Type: application/json --- error_code: 400 --- response_body chomp request body exceeds toxicity threshold @@ -279,7 +291,9 @@ request body exceeds toxicity threshold === TEST 10: profane_but_not_toxic request should pass --- request POST /echo -profane_but_not_toxic +{"messages":[{"role":"user","content":"profane_but_not_toxic"}]} +--- more_headers +Content-Type: application/json --- error_code: 200 @@ -287,7 +301,9 @@ profane_but_not_toxic === TEST 11: but very profane request will fail --- request POST /echo -very_profane +{"messages":[{"role":"user","content":"very_profane"}]} +--- more_headers +Content-Type: application/json --- error_code: 400 --- response_body chomp request body exceeds PROFANITY threshold @@ -297,7 +313,9 @@ request body exceeds PROFANITY threshold === TEST 12: good_request should pass --- request POST /echo -good_request +{"messages":[{"role":"user","content":"good_request"}]} +--- more_headers +Content-Type: application/json --- error_code: 200 @@ -402,3 +420,88 @@ Content-Type: multipart/form-data --- error_code: 400 --- response_body eval qr/only application\/json is supported/ + + + +=== TEST 17: only the decoded LLM content is moderated, not the raw JSON envelope +--- request +POST /echo +{"model":"gpt-4","messages":[{"role":"user","content":"toxic"}]} +--- more_headers +Content-Type: application/json +--- error_code: 400 +--- response_body chomp +request body exceeds toxicity threshold + + + +=== TEST 18: non-AI JSON body is rejected when fail_mode=error +--- request +POST /echo +{"foo":"bar"} +--- more_headers +Content-Type: application/json +--- error_code: 400 +--- response_body eval +qr/no supported AI protocol for the request/ + + + +=== TEST 19: malformed JSON body is rejected when fail_mode=error +--- request +POST /echo +not-json +--- more_headers +Content-Type: application/json +--- error_code: 400 + + + +=== TEST 20: setup route with default fail_mode (skip) +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/echo", + "plugins": { + "ai-aws-content-moderation": { + "comprehend": { + "access_key_id": "access", + "secret_access_key": "ea+secret", + "region": "us-east-1", + "endpoint": "http://localhost:2668" + } + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 21: non-AI JSON body passes through unchecked when fail_mode=skip +--- request +POST /echo +{"foo":"bar"} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_body chomp +{"foo":"bar"} diff --git a/t/plugin/ai-aws-content-moderation2.t b/t/plugin/ai-aws-content-moderation2.t index 869fcf09d124..fbc4db583db1 100644 --- a/t/plugin/ai-aws-content-moderation2.t +++ b/t/plugin/ai-aws-content-moderation2.t @@ -84,7 +84,9 @@ passed === TEST 2: request should fail --- request POST /echo -toxic +{"messages":[{"role":"user","content":"toxic"}]} +--- more_headers +Content-Type: application/json --- error_code: 500 --- response_body chomp Comprehend:detectToxicContent() failed to connect to 'http://localhost:2668': connection refused