From 2840051b9bdec17afe135b312de7b7d5d347270f Mon Sep 17 00:00:00 2001
From: Abhishek Choudhary <shreemaan.abhishek@gmail.com>
Date: Thu, 11 Jun 2026 17:53:14 +0800
Subject: [PATCH] fix(ai-aws-content-moderation): moderate decoded LLM content
 instead of raw body

The plugin sent the raw HTTP request body to AWS Comprehend, so
Comprehend scored the undecoded JSON envelope (e.g. the "messages"
wrapper and JSON escape sequences such as \uXXXX left as literal text)
instead of the actual prompt the upstream LLM acts on. As a result,
moderation sees different text than the model does, and the envelope
adds noise to the toxicity result.

Make the plugin protocol-aware, like ai-aliyun-content-moderation: require
application/json, parse the body, detect the client protocol via
ai-protocols, and send only the normalized, decoded LLM-visible content
to Comprehend. Non-AI requests (non-JSON bodies, unparseable JSON, or
JSON that matches no supported AI protocol and so carries no LLM content)
are governed by the existing fail_mode. A recognized AI request whose
extracted content is empty is passed through without calling Comprehend.
---
 apisix/plugins/ai-aws-content-moderation.lua  |  39 +++++-
 .../plugins/ai-aws-content-moderation.md      |   2 +-
 t/plugin/ai-aws-content-moderation.t          | 121 ++++++++++++++++--
 t/plugin/ai-aws-content-moderation2.t         |   4 +-
 4 files changed, 152 insertions(+), 14 deletions(-)

diff --git a/apisix/plugins/ai-aws-content-moderation.lua b/apisix/plugins/ai-aws-content-moderation.lua
index a3a1295c6870..8d8e3739e820 100644
--- a/apisix/plugins/ai-aws-content-moderation.lua
+++ b/apisix/plugins/ai-aws-content-moderation.lua
@@ -18,6 +18,7 @@ require("resty.aws.config") -- to read env vars before initing aws module
 
 local core = require("apisix.core")
 local binding = require("apisix.plugins.ai-protocols.binding")
+local protocols = require("apisix.plugins.ai-protocols")
 local aws = require("resty.aws")
 local aws_instance
 
@@ -107,9 +108,41 @@ function _M.rewrite(conf, ctx)
         return
     end
 
-    local body, err = core.request.get_body()
+    local body, err = core.request.get_json_request_body_table()
     if not body then
-        return HTTP_BAD_REQUEST, err
+        local msg = type(err) == "table" and err.message or err
+        local handled, code, resp = binding.on_unsupported(
+            conf.fail_mode, _M.name, ctx,
+            "failed to parse request body: " .. (msg or "invalid JSON"),
+            HTTP_BAD_REQUEST, err)
+        if handled then
+            return code, resp
+        end
+        return
+    end
+
+    -- The plugin runs before ai-proxy, so detect the client protocol here rather
+    -- than relying on ctx.ai_client_protocol. "passthrough" is the catch-all for
+    -- non-AI bodies, which carry no LLM content to moderate.
+    local protocol_name = protocols.detect(body, ctx)
+    local proto = protocol_name and protocols.get(protocol_name)
+    if not proto or protocol_name == "passthrough" or not proto.extract_request_content then
+        local handled, code, resp = binding.on_unsupported(
+            conf.fail_mode, _M.name, ctx,
+            "no supported AI protocol for the request",
+            HTTP_BAD_REQUEST, "no supported AI protocol for the request")
+        if handled then
+            return code, resp
+        end
+        return
+    end
+
+    -- moderate the decoded LLM-visible content, not the raw JSON envelope
+    local contents = proto.extract_request_content(body)
+    local text = core.table.concat(contents, " ")
+    if text == "" then
+        -- no LLM-visible content to moderate
+        return
     end
 
     local comprehend = conf.comprehend
@@ -139,7 +172,7 @@ function _M.rewrite(conf, ctx)
     local res, err = comprehend:detectToxicContent({
         LanguageCode = "en",
         TextSegments = {{
-            Text = body
+            Text = text
         }},
     })
 
diff --git a/docs/en/latest/plugins/ai-aws-content-moderation.md b/docs/en/latest/plugins/ai-aws-content-moderation.md
index fce2755a3239..239475ff986c 100644
--- a/docs/en/latest/plugins/ai-aws-content-moderation.md
+++ b/docs/en/latest/plugins/ai-aws-content-moderation.md
@@ -38,7 +38,7 @@ import TabItem from '@theme/TabItem';
 
 The `ai-aws-content-moderation` Plugin integrates with [AWS Comprehend](https://aws.amazon.com/comprehend/) to check request bodies for toxicity when proxying to LLMs, such as profanity, hate speech, insult, harassment, violence, and more, rejecting requests if the evaluated outcome exceeds the configured threshold.
 
-This Plugin must be used in Routes that proxy requests to LLMs only.
+This Plugin must be used in Routes that proxy requests to LLMs only. The Plugin parses the `application/json` request body and sends only the decoded LLM-visible content (for example `messages[].content`) to AWS Comprehend, rather than the raw request body. Requests that are not recognized as AI requests (non-JSON bodies, malformed JSON, or JSON that carries no LLM content) are handled according to `fail_mode`.
 
 ## Plugin Attributes
 
diff --git a/t/plugin/ai-aws-content-moderation.t b/t/plugin/ai-aws-content-moderation.t
index 765bba1ab567..0bdd8ebbbfe9 100644
--- a/t/plugin/ai-aws-content-moderation.t
+++ b/t/plugin/ai-aws-content-moderation.t
@@ -139,7 +139,9 @@ passed
 === TEST 2: toxic request should fail
 --- request
 POST /echo
-toxic
+{"messages":[{"role":"user","content":"toxic"}]}
+--- more_headers
+Content-Type: application/json
 --- error_code: 400
 --- response_body chomp
 request body exceeds toxicity threshold
@@ -149,7 +151,9 @@ request body exceeds toxicity threshold
 === TEST 3: good request should pass
 --- request
 POST /echo
-good_request
+{"messages":[{"role":"user","content":"good_request"}]}
+--- more_headers
+Content-Type: application/json
 --- error_code: 200
 
 
@@ -199,7 +203,9 @@ passed
 === TEST 5: profane request should fail
 --- request
 POST /echo
-profane
+{"messages":[{"role":"user","content":"profane"}]}
+--- more_headers
+Content-Type: application/json
 --- error_code: 400
 --- response_body chomp
 request body exceeds PROFANITY threshold
@@ -209,7 +215,9 @@ request body exceeds PROFANITY threshold
 === TEST 6: very profane request should also fail
 --- request
 POST /echo
-very_profane
+{"messages":[{"role":"user","content":"very_profane"}]}
+--- more_headers
+Content-Type: application/json
 --- error_code: 400
 --- response_body chomp
 request body exceeds PROFANITY threshold
@@ -219,7 +227,9 @@ request body exceeds PROFANITY threshold
 === TEST 7: good_request should pass
 --- request
 POST /echo
-good_request
+{"messages":[{"role":"user","content":"good_request"}]}
+--- more_headers
+Content-Type: application/json
 --- error_code: 200
 
 
@@ -269,7 +279,9 @@ passed
 === TEST 9: profane request should pass profanity check but fail toxicity check
 --- request
 POST /echo
-profane
+{"messages":[{"role":"user","content":"profane"}]}
+--- more_headers
+Content-Type: application/json
 --- error_code: 400
 --- response_body chomp
 request body exceeds toxicity threshold
@@ -279,7 +291,9 @@ request body exceeds toxicity threshold
 === TEST 10: profane_but_not_toxic request should pass
 --- request
 POST /echo
-profane_but_not_toxic
+{"messages":[{"role":"user","content":"profane_but_not_toxic"}]}
+--- more_headers
+Content-Type: application/json
 --- error_code: 200
 
 
@@ -287,7 +301,9 @@ profane_but_not_toxic
 === TEST 11: but very profane request will fail
 --- request
 POST /echo
-very_profane
+{"messages":[{"role":"user","content":"very_profane"}]}
+--- more_headers
+Content-Type: application/json
 --- error_code: 400
 --- response_body chomp
 request body exceeds PROFANITY threshold
@@ -297,7 +313,9 @@ request body exceeds PROFANITY threshold
 === TEST 12: good_request should pass
 --- request
 POST /echo
-good_request
+{"messages":[{"role":"user","content":"good_request"}]}
+--- more_headers
+Content-Type: application/json
 --- error_code: 200
 
 
@@ -402,3 +420,88 @@ Content-Type: multipart/form-data
 --- error_code: 400
 --- response_body eval
 qr/only application\/json is supported/
+
+
+
+=== TEST 17: only the decoded LLM content is moderated, not the raw JSON envelope
+--- request
+POST /echo
+{"model":"gpt-4","messages":[{"role":"user","content":"toxic"}]}
+--- more_headers
+Content-Type: application/json
+--- error_code: 400
+--- response_body chomp
+request body exceeds toxicity threshold
+
+
+
+=== TEST 18: non-AI JSON body is rejected when fail_mode=error
+--- request
+POST /echo
+{"foo":"bar"}
+--- more_headers
+Content-Type: application/json
+--- error_code: 400
+--- response_body eval
+qr/no supported AI protocol for the request/
+
+
+
+=== TEST 19: malformed JSON body is rejected when fail_mode=error
+--- request
+POST /echo
+not-json
+--- more_headers
+Content-Type: application/json
+--- error_code: 400
+
+
+
+=== TEST 20: setup route with default fail_mode (skip)
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/routes/1',
+                ngx.HTTP_PUT,
+                [[{
+                    "uri": "/echo",
+                    "plugins": {
+                        "ai-aws-content-moderation": {
+                            "comprehend": {
+                                "access_key_id": "access",
+                                "secret_access_key": "ea+secret",
+                                "region": "us-east-1",
+                                "endpoint": "http://localhost:2668"
+                            }
+                        }
+                    },
+                    "upstream": {
+                        "type": "roundrobin",
+                        "nodes": {
+                            "127.0.0.1:1980": 1
+                        }
+                    }
+                }]]
+            )
+
+            if code >= 300 then
+                ngx.status = code
+            end
+            ngx.say(body)
+        }
+    }
+--- response_body
+passed
+
+
+
+=== TEST 21: non-AI JSON body passes through unchecked when fail_mode=skip
+--- request
+POST /echo
+{"foo":"bar"}
+--- more_headers
+Content-Type: application/json
+--- error_code: 200
+--- response_body chomp
+{"foo":"bar"}
diff --git a/t/plugin/ai-aws-content-moderation2.t b/t/plugin/ai-aws-content-moderation2.t
index 869fcf09d124..fbc4db583db1 100644
--- a/t/plugin/ai-aws-content-moderation2.t
+++ b/t/plugin/ai-aws-content-moderation2.t
@@ -84,7 +84,9 @@ passed
 === TEST 2: request should fail
 --- request
 POST /echo
-toxic
+{"messages":[{"role":"user","content":"toxic"}]}
+--- more_headers
+Content-Type: application/json
 --- error_code: 500
 --- response_body chomp
 Comprehend:detectToxicContent() failed to connect to 'http://localhost:2668': connection refused