awslabs · theagenticguy · Feb 26, 2026 · Feb 27, 2026 · Mar 3, 2026 · Mar 5, 2026
@@ -24,7 +24,12 @@
       ],
       "name": "deploy-on-aws",
       "source": "./plugins/deploy-on-aws",
-      "tags": ["aws", "deploy", "infrastructure", "cdk"],
+      "tags": [
+        "aws",
+        "deploy",
+        "infrastructure",
+        "cdk"
+      ],
       "version": "1.1.0"
     },
     {
@@ -45,7 +50,44 @@
       ],
       "name": "amazon-location-service",
       "source": "./plugins/amazon-location-service",
-      "tags": ["aws", "location", "maps", "geospatial"],
+      "tags": [
+        "aws",
+        "location",
+        "maps",
+        "geospatial"
+      ],
+      "version": "1.0.0"
+    },
+    {
+      "category": "observability",
+      "description": "Comprehensive AWS observability and FinOps platform combining CloudWatch Logs, Metrics, Alarms, Application Signals (APM), CloudTrail security auditing, Billing & Cost Management, and automated codebase observability gap analysis.",
+      "keywords": [
+        "aws",
+        "observability",
+        "cloudwatch",
+        "monitoring",
+        "logs",
+        "metrics",
+        "alarms",
+        "application-signals",
+        "apm",
+        "cloudtrail",
+        "security",
+        "tracing",
+        "billing",
+        "cost-management",
+        "finops",
+        "incident-response"
+      ],
+      "name": "observability-on-aws",
+      "source": "./plugins/observability-on-aws",
+      "tags": [
+        "aws",
+        "observability",
+        "monitoring",
+        "cloudwatch",
+        "finops"
+      ],
       "version": "1.0.0"
     },
     {
@@ -64,7 +106,12 @@
       ],
       "name": "migration-to-aws",
       "source": "./plugins/migration-to-aws",
-      "tags": ["aws", "gcp", "migration", "infrastructure"],
+      "tags": [
+        "aws",
+        "gcp",
+        "migration",
+        "infrastructure"
+      ],
       "version": "1.0.0"
     },
     {
@@ -113,7 +160,11 @@
       ],
       "name": "aws-amplify",
       "source": "./plugins/aws-amplify",
-      "tags": ["aws", "amplify", "fullstack"],
+      "tags": [
+        "aws",
+        "amplify",
+        "fullstack"
+      ],
       "version": "1.0.0"
     }
   ]

@@ -0,0 +1,29 @@
+{
+  "author": {
+    "name": "Amazon Web Services"
+  },
+  "description": "Comprehensive AWS observability and FinOps platform combining CloudWatch Logs, Metrics, Alarms, Application Signals (APM), CloudTrail security auditing, Billing & Cost Management, and automated codebase observability gap analysis for monitoring, troubleshooting, cost optimization, and incident response.",
+  "homepage": "https://github.com/awslabs/agent-plugins",
+  "keywords": [
+    "aws",
+    "observability",
+    "cloudwatch",
+    "monitoring",
+    "logs",
+    "metrics",
+    "alarms",
+    "application-signals",
+    "apm",
+    "cloudtrail",
+    "security",
+    "tracing",
+    "billing",
+    "cost-management",
+    "finops",
+    "incident-response"
+  ],
+  "license": "Apache-2.0",
+  "name": "observability-on-aws",
+  "repository": "https://github.com/awslabs/agent-plugins",
+  "version": "1.0.0"
+}
@@ -0,0 +1,52 @@
+{
+  "mcpServers": {
+    "awsknowledge": {
+      "type": "http",
+      "url": "https://knowledge-mcp.global.api.aws"
+    },
+    "awslabs.billing-cost-management-mcp-server": {
+      "args": [
+        "awslabs.billing-cost-management-mcp-server@latest"
+      ],
+      "command": "uvx",
+      "env": {
+        "AWS_PROFILE": "default",
+        "AWS_REGION": "us-east-1",
+        "FASTMCP_LOG_LEVEL": "ERROR"
+      }
+    },
+    "awslabs.cloudtrail-mcp-server": {
+      "args": [
+        "awslabs.cloudtrail-mcp-server@latest"
+      ],
+      "command": "uvx",
+      "env": {
+        "AWS_PROFILE": "default",
+        "AWS_REGION": "us-east-1",
+        "FASTMCP_LOG_LEVEL": "ERROR"
+      }
+    },
+    "awslabs.cloudwatch-applicationsignals-mcp-server": {
+      "args": [
+        "awslabs.cloudwatch-applicationsignals-mcp-server@latest"
+      ],
+      "command": "uvx",
+      "env": {
+        "AWS_PROFILE": "default",
+        "AWS_REGION": "us-east-1",
+        "FASTMCP_LOG_LEVEL": "ERROR"
+      }
+    },
+    "awslabs.cloudwatch-mcp-server": {
+      "args": [
+        "awslabs.cloudwatch-mcp-server@latest"
+      ],
+      "command": "uvx",
+      "env": {
+        "AWS_PROFILE": "default",
+        "AWS_REGION": "us-east-1",
+        "FASTMCP_LOG_LEVEL": "ERROR"
+      }
+    }
+  }
+}
@@ -0,0 +1,88 @@
+---
+name: observability-on-aws
+description: "Comprehensive AWS observability platform combining CloudWatch Logs, Metrics, Alarms, Application Signals (APM), CloudTrail security auditing, Billing & Cost Management, and automated codebase observability gap analysis. Triggers on phrases like: CloudWatch logs, metrics, alarms, monitoring, observability, application signals, APM, distributed tracing, performance, latency, errors, troubleshooting, root cause analysis, security audit, CloudTrail, log analysis, alerting, SLO, incident response, observability gaps, missing instrumentation, AWS costs, billing, cost anomaly."
+---
+
+# AWS Observability
+
+Requires AWS CLI credentials. All stdio MCP servers use `AWS_PROFILE` and `AWS_REGION` from their env config (defaults: `default` profile, `us-east-1`).
+
+Note: This plugin is read-only by default. It should only query and inspect AWS resources and provide recommendations. It must not provision, modify, or delete AWS resources unless the user explicitly asks for a change, and such changes should preferably be executed via a dedicated deployment or provisioning workflow/plugin.
+
+## Capabilities
+
+| Capability                  | MCP Server                                         | Use When                                                 |
+| --------------------------- | -------------------------------------------------- | -------------------------------------------------------- |
+| CloudWatch Logs             | `awslabs.cloudwatch-mcp-server`                    | Log queries, pattern detection, anomaly analysis         |
+| Metrics & Alarms            | `awslabs.cloudwatch-mcp-server`                    | Metric data, alarm recommendations, trend analysis       |
+| Application Signals (APM)   | `awslabs.cloudwatch-applicationsignals-mcp-server` | Service health, SLOs, distributed tracing, error budgets |
+| CloudTrail Security         | `awslabs.cloudtrail-mcp-server`                    | IAM changes, resource deletions, compliance audits       |
+| Billing & Cost Management   | `awslabs.billing-cost-management-mcp-server`       | Cost analysis, forecasting, Compute Optimizer, budgets   |
+| AWS Documentation           | `awsknowledge` (HTTP)                              | Troubleshooting, best practices, API references          |
+| Codebase Observability Gaps | _(file analysis, no MCP)_                          | Identify missing logging, metrics, tracing in code       |
+
+## Workflow Decision Tree
+
+**User reports an incident or error?**
+-> Load [Incident Response](references/incident-response.md). Start with `audit_services` wildcard, then correlate alarms + logs + traces + CloudTrail changes.
+
+**User asks about logs or wants to query logs?**
+-> Load [Log Analysis](references/log-analysis.md). Use `execute_log_insights_query`. Always include `| limit` in queries.
+
+**User wants to set up or tune alarms?**
+-> Load [Alerting Setup](references/alerting-setup.md). Use `get_recommended_metric_alarms` for best-practice thresholds.
+
+**User asks about service performance, latency, or SLOs?**
+-> Load [Performance Monitoring](references/performance-monitoring.md). Start with `audit_services`, then `search_transaction_spans` for 100% trace visibility.
+
+**User needs security audit or compliance review?**
+-> Load [Security Auditing](references/security-auditing.md). Follow data source priority: CloudTrail Lake > CloudWatch Logs > Lookup Events API.
+
+**User wants to assess codebase observability?**
+-> Load [Observability Gap Analysis](references/observability-gap-analysis.md). Analyze logging, metrics, tracing, error handling, health checks.
+
+**User setting up Application Signals for the first time?**
+-> Load [Application Signals Setup](references/application-signals-setup.md). Start with `get_enablement_guide`.
+
+**Need prerequisites or configuration help?**
+-> Load [Prerequisites and Configuration](references/prerequisites.md) for IAM permissions, MCP server setup, and AWS credential configuration.
+
+## Essential Log Query Patterns
+
+### Error Search
+
+```
+fields @timestamp, @message, @logStream, level
+| filter level = "ERROR"
+| sort @timestamp desc
+| limit 100
+```
+
+### Performance Analysis
+
+```
+stats count() as requestCount,
+      avg(duration) as avgDuration,
+      pct(duration, 95) as p95Duration,
+      pct(duration, 99) as p99Duration
+by endpoint
+| filter requestCount > 10
+| sort p95Duration desc
+| limit 100
+```
+
+### Error Rate Over Time
+
+```
+stats count() as total,
+      sum(statusCode >= 500) as errors,
+      (sum(statusCode >= 500) / count()) * 100 as errorRate
+by bin(5m) as timeWindow
+| sort timeWindow
+| limit 1000
+```
+
+## Key Tool Entry Points
+
+- **Application Signals**: Start with `audit_services` using `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]` for wildcard discovery
+- **Logs**: Use `describe_log_groups` to discover groups, then `execute_log_insights_query`
+- **Metrics**: Use Sum for count metrics, Average for utilization, percentiles for latency
+- **CloudTrail**: Check Lake first (`list_event_data_stores`), fall back to CloudWatch Logs, then `lookup_events`
+- **Costs**: Use `cost-explorer` tool for spend analysis, `compute-optimizer` for right-sizing
@@ -0,0 +1,88 @@
+# Advanced Alerting
+
+## Purpose
+
+Advanced alerting patterns including composite alarms, anomaly detection, SLO-based alerting, and alarm tuning. For basic alarm setup and configuration patterns, see `alerting-setup.md`.
+
+## Composite Alarms
+
+**Service Health** - combine metrics to determine overall health:
+
+```
+Composite Alarm: "api-service-unhealthy"
+Logic: (high-error-rate OR high-latency) AND low-success-rate
+Components: Errors > 5%, p99 Latency > 2000ms, Success rate < 95%
+```
+
+**Dependency Failure** - detect cascading failures:
+
+```
+Composite Alarm: "service-and-dependency-down"
+Logic: service-errors AND (database-errors OR cache-errors)
+Components: Lambda Errors > 10, RDS CPU > 90%, ElastiCache Evictions > 1000
+```
+
+## Anomaly Detection Alarms
+
+**When to Use**: Metrics with predictable patterns (daily/weekly cycles), metrics where absolute thresholds are hard to define, or detecting unusual behavior vs normal patterns.
+
+```
+Metric: AWS/ApiGateway - Count | Anomaly Detection: Enabled
+Threshold: 2 standard deviations | Evaluation Period: 10 min
+Rationale: Learns normal request patterns, adapts to traffic growth over time
+```
+
+## SLO-Based Alerting
+
+Use SLO error budget consumption to drive alerting thresholds:
+
+```
+SLO: 99.9% availability (30-day window)
+Error Budget: 0.1% = 43.2 minutes downtime/month
+  Warning (50% consumed, 21.6 min): Notify team, review recent changes
+  Critical (80% consumed, 34.6 min): Page on-call, implement feature freeze
+  Emergency (100% consumed, 43.2 min): All hands, immediate mitigation
+```
+
+**Implementation**: Set up SLO in Application Signals, create CloudWatch alarm on error budget metric, configure multi-level thresholds, link to incident response procedures.
+
+## Alarm Actions
+
+- **Critical**: Page on-call (PagerDuty/Opsgenie), post to critical alerts channel, create high-priority ticket
+- **Warning**: Post to team channel, create normal-priority ticket, email team distribution list
+- **Info**: Log to monitoring system, email individual owner, no immediate action required
+
+## Alarm Tuning and Maintenance
+
+### Reducing False Positives
+
+When alarms trigger frequently without real issues (alarm fatigue):
+
+1. **Adjust Thresholds**: Review alarm history, analyze patterns, increase threshold if too sensitive, use percentiles instead of max/min
+2. **Increase Datapoints to Alarm**: Change from 1/1 to 2/3 to require sustained breach
+3. **Use Composite Alarms**: Combine multiple signals for more accurate detection
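+4. **Implement Maintenance Windows**: Suppress alarm notifications during deployments by temporarily disabling alarm actions (`DisableAlarmActions` / `EnableAlarmActions`) or by configuring an actions suppressor on a composite alarm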
+
+### Handling Alarm Flapping
+
+When an alarm rapidly switches between OK and ALARM:
+
+1. **Increase Evaluation Period**: Longer time windows smooth oscillations
+2. **Add Hysteresis**: Different thresholds for alarm and recovery (e.g., alarm at 80%, recover at 70%)
+3. **Use Anomaly Detection**: Adapts to patterns, less sensitive to threshold proximity
+
+## Alarm Testing
+
+**Test Checklist**: Alarm triggers on breach, recovers on return to normal, actions execute correctly, description is actionable, runbook link works, on-call receives notification within SLA.
+
+**Testing Approaches**:
+
+1. **Synthetic Testing**: Inject errors or load, verify alarm triggers, confirm notifications
+2. **Historical Analysis**: Review past incidents, check if alarm would have triggered, adjust as needed
+3. **Chaos Engineering**: Deliberately cause failures, validate detection and incident response
+
+## Integration with Incident Response
+
+**Alarm-Triggered Investigation**: The alarm triggers a notification; the on-call engineer reviews the alarm details, queries CloudWatch Logs for errors, analyzes Application Signals traces, checks CloudTrail for recent changes (following the data source priority: CloudTrail Lake, then CloudWatch Logs, then the Lookup Events API), implements mitigation, and updates the alarm if needed.
+
+**Proactive Monitoring**: Review alarm history daily, identify patterns and trends, tune thresholds before issues occur, add missing alarms for coverage gaps, document learnings in runbooks.