diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 32cdfbd7..902b7653 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -24,7 +24,12 @@ ], "name": "deploy-on-aws", "source": "./plugins/deploy-on-aws", - "tags": ["aws", "deploy", "infrastructure", "cdk"], + "tags": [ + "aws", + "deploy", + "infrastructure", + "cdk" + ], "version": "1.1.0" }, { @@ -45,7 +50,44 @@ ], "name": "amazon-location-service", "source": "./plugins/amazon-location-service", - "tags": ["aws", "location", "maps", "geospatial"], + "tags": [ + "aws", + "location", + "maps", + "geospatial" + ], + "version": "1.0.0" + }, + { + "category": "observability", + "description": "Comprehensive AWS observability and FinOps platform combining CloudWatch Logs, Metrics, Alarms, Application Signals (APM), CloudTrail security auditing, Billing & Cost Management, and automated codebase observability gap analysis.", + "keywords": [ + "aws", + "observability", + "cloudwatch", + "monitoring", + "logs", + "metrics", + "alarms", + "application-signals", + "apm", + "cloudtrail", + "security", + "tracing", + "billing", + "cost-management", + "finops", + "incident-response" + ], + "name": "observability-on-aws", + "source": "./plugins/observability-on-aws", + "tags": [ + "aws", + "observability", + "monitoring", + "cloudwatch", + "finops" + ], "version": "1.0.0" }, { @@ -64,7 +106,12 @@ ], "name": "migration-to-aws", "source": "./plugins/migration-to-aws", - "tags": ["aws", "gcp", "migration", "infrastructure"], + "tags": [ + "aws", + "gcp", + "migration", + "infrastructure" + ], "version": "1.0.0" }, { @@ -113,7 +160,11 @@ ], "name": "aws-amplify", "source": "./plugins/aws-amplify", - "tags": ["aws", "amplify", "fullstack"], + "tags": [ + "aws", + "amplify", + "fullstack" + ], "version": "1.0.0" } ] diff --git a/plugins/observability-on-aws/.claude-plugin/plugin.json b/plugins/observability-on-aws/.claude-plugin/plugin.json new file mode 100644 index 
00000000..96c07b00 --- /dev/null +++ b/plugins/observability-on-aws/.claude-plugin/plugin.json @@ -0,0 +1,29 @@ +{ + "author": { + "name": "Amazon Web Services" + }, + "description": "Comprehensive AWS observability and FinOps platform combining CloudWatch Logs, Metrics, Alarms, Application Signals (APM), CloudTrail security auditing, Billing & Cost Management, and automated codebase observability gap analysis for monitoring, troubleshooting, cost optimization, and incident response.", + "homepage": "https://github.com/awslabs/agent-plugins", + "keywords": [ + "aws", + "observability", + "cloudwatch", + "monitoring", + "logs", + "metrics", + "alarms", + "application-signals", + "apm", + "cloudtrail", + "security", + "tracing", + "billing", + "cost-management", + "finops", + "incident-response" + ], + "license": "Apache-2.0", + "name": "observability-on-aws", + "repository": "https://github.com/awslabs/agent-plugins", + "version": "1.0.0" +} diff --git a/plugins/observability-on-aws/.mcp.json b/plugins/observability-on-aws/.mcp.json new file mode 100644 index 00000000..9425ccca --- /dev/null +++ b/plugins/observability-on-aws/.mcp.json @@ -0,0 +1,52 @@ +{ + "mcpServers": { + "awsknowledge": { + "type": "http", + "url": "https://knowledge-mcp.global.api.aws" + }, + "awslabs.billing-cost-management-mcp-server": { + "args": [ + "awslabs.billing-cost-management-mcp-server@latest" + ], + "command": "uvx", + "env": { + "AWS_PROFILE": "default", + "AWS_REGION": "us-east-1", + "FASTMCP_LOG_LEVEL": "ERROR" + } + }, + "awslabs.cloudtrail-mcp-server": { + "args": [ + "awslabs.cloudtrail-mcp-server@latest" + ], + "command": "uvx", + "env": { + "AWS_PROFILE": "default", + "AWS_REGION": "us-east-1", + "FASTMCP_LOG_LEVEL": "ERROR" + } + }, + "awslabs.cloudwatch-applicationsignals-mcp-server": { + "args": [ + "awslabs.cloudwatch-applicationsignals-mcp-server@latest" + ], + "command": "uvx", + "env": { + "AWS_PROFILE": "default", + "AWS_REGION": "us-east-1", + "FASTMCP_LOG_LEVEL": 
"ERROR" + } + }, + "awslabs.cloudwatch-mcp-server": { + "args": [ + "awslabs.cloudwatch-mcp-server@latest" + ], + "command": "uvx", + "env": { + "AWS_PROFILE": "default", + "AWS_REGION": "us-east-1", + "FASTMCP_LOG_LEVEL": "ERROR" + } + } + } +} diff --git a/plugins/observability-on-aws/skills/observability-on-aws/SKILL.md b/plugins/observability-on-aws/skills/observability-on-aws/SKILL.md new file mode 100644 index 00000000..a11e1751 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/SKILL.md @@ -0,0 +1,88 @@ +--- +name: observability-on-aws +description: "Comprehensive AWS observability platform combining CloudWatch Logs, Metrics, Alarms, Application Signals (APM), CloudTrail security auditing, Billing & Cost Management, and automated codebase observability gap analysis. Triggers on phrases like: CloudWatch logs, metrics, alarms, monitoring, observability, application signals, APM, distributed tracing, performance, latency, errors, troubleshooting, root cause analysis, security audit, CloudTrail, log analysis, alerting, SLO, incident response, observability gaps, missing instrumentation, AWS costs, billing, cost anomaly." +--- + +# AWS Observability + +Requires AWS CLI credentials. All stdio MCP servers use `AWS_PROFILE` and `AWS_REGION` from their env config (defaults: `default` profile, `us-east-1`). 
+ +## Capabilities + +| Capability | MCP Server | Use When | +| --------------------------- | -------------------------------------------------- | -------------------------------------------------------- | +| CloudWatch Logs | `awslabs.cloudwatch-mcp-server` | Log queries, pattern detection, anomaly analysis | +| Metrics & Alarms | `awslabs.cloudwatch-mcp-server` | Metric data, alarm recommendations, trend analysis | +| Application Signals (APM) | `awslabs.cloudwatch-applicationsignals-mcp-server` | Service health, SLOs, distributed tracing, error budgets | +| CloudTrail Security | `awslabs.cloudtrail-mcp-server` | IAM changes, resource deletions, compliance audits | +| Billing & Cost Management | `awslabs.billing-cost-management-mcp-server` | Cost analysis, forecasting, Compute Optimizer, budgets | +| AWS Documentation | `awsknowledge` (HTTP) | Troubleshooting, best practices, API references | +| Codebase Observability Gaps | _(file analysis, no MCP)_ | Identify missing logging, metrics, tracing in code | + +## Workflow Decision Tree + +**User reports an incident or error?** +-> Load [Incident Response](references/incident-response.md). Start with `audit_services` wildcard, then correlate alarms + logs + traces + CloudTrail changes. + +**User asks about logs or wants to query logs?** +-> Load [Log Analysis](references/log-analysis.md). Use `execute_log_insights_query`. Always include `| limit` in queries. + +**User wants to set up or tune alarms?** +-> Load [Alerting Setup](references/alerting-setup.md). Use `get_recommended_metric_alarms` for best-practice thresholds. + +**User asks about service performance, latency, or SLOs?** +-> Load [Performance Monitoring](references/performance-monitoring.md). Start with `audit_services`, then `search_transaction_spans` for 100% trace visibility. + +**User needs security audit or compliance review?** +-> Load [Security Auditing](references/security-auditing.md). 
Follow data source priority: CloudTrail Lake > CloudWatch Logs > Lookup Events API. + +**User wants to assess codebase observability?** +-> Load [Observability Gap Analysis](references/observability-gap-analysis.md). Analyze logging, metrics, tracing, error handling, health checks. + +**User setting up Application Signals for the first time?** +-> Load [Application Signals Setup](references/application-signals-setup.md). Start with `get_enablement_guide`. + +**Need prerequisites or configuration help?** +-> Load [Prerequisites and Configuration](references/prerequisites.md) for IAM permissions, MCP server setup, and AWS credential configuration. + +## Essential Log Query Patterns + +### Error Search + +``` +fields @timestamp, @message, @logStream, level +| filter level = "ERROR" +| sort @timestamp desc +| limit 100 +``` + +### Performance Analysis + +``` +stats count() as requestCount, + avg(duration) as avgDuration, + pct(duration, 95) as p95Duration, + pct(duration, 99) as p99Duration +by endpoint +| filter requestCount > 10 +| sort p95Duration desc +| limit 100 +``` + +### Error Rate Over Time + +``` +stats count() as total, + sum(statusCode >= 500) as errors, + (sum(statusCode >= 500) / count()) * 100 as errorRate +by bin(5m) as timeWindow +| sort timeWindow +``` + +## Key Tool Entry Points + +- **Application Signals**: Start with `audit_services` using `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]` for wildcard discovery +- **Logs**: Use `describe_log_groups` to discover groups, then `execute_log_insights_query` +- **Metrics**: Use Sum for count metrics, Average for utilization, percentiles for latency +- **CloudTrail**: Check Lake first (`list_event_data_stores`), fall back to CloudWatch Logs, then `lookup_events` +- **Costs**: Use `cost-explorer` tool for spend analysis, `compute-optimizer` for right-sizing diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/alerting-advanced.md 
b/plugins/observability-on-aws/skills/observability-on-aws/references/alerting-advanced.md new file mode 100644 index 00000000..1745c7be --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/alerting-advanced.md @@ -0,0 +1,88 @@ +# Advanced Alerting + +## Purpose + +Advanced alerting patterns including composite alarms, anomaly detection, SLO-based alerting, and alarm tuning. For basic alarm setup and configuration patterns, see `alerting-setup.md`. + +## Composite Alarms + +**Service Health** - combine metrics to determine overall health: + +``` +Composite Alarm: "api-service-unhealthy" +Logic: (high-error-rate OR high-latency) AND low-success-rate +Components: Errors > 5%, p99 Latency > 2000ms, Success rate < 95% +``` + +**Dependency Failure** - detect cascading failures: + +``` +Composite Alarm: "service-and-dependency-down" +Logic: service-errors AND (database-errors OR cache-errors) +Components: Lambda Errors > 10, RDS CPU > 90%, ElastiCache Evictions > 1000 +``` + +## Anomaly Detection Alarms + +**When to Use**: Metrics with predictable patterns (daily/weekly cycles), metrics where absolute thresholds are hard to define, or detecting unusual behavior vs normal patterns. 
+ +``` +Metric: AWS/ApiGateway - Count | Anomaly Detection: Enabled +Threshold: 2 standard deviations | Evaluation Period: 10 min +Rationale: Learns normal request patterns, adapts to traffic growth over time +``` + +## SLO-Based Alerting + +Use SLO error budget consumption to drive alerting thresholds: + +``` +SLO: 99.9% availability (30-day window) +Error Budget: 0.1% = 43.2 minutes downtime/month + Warning (50% consumed, 21.6 min): Notify team, review recent changes + Critical (80% consumed, 34.6 min): Page on-call, implement feature freeze + Emergency (100% consumed, 43.2 min): All hands, immediate mitigation +``` + +**Implementation**: Set up SLO in Application Signals, create CloudWatch alarm on error budget metric, configure multi-level thresholds, link to incident response procedures. + +## Alarm Actions + +- **Critical**: Page on-call (PagerDuty/Opsgenie), post to critical alerts channel, create high-priority ticket +- **Warning**: Post to team channel, create normal-priority ticket, email team distribution list +- **Info**: Log to monitoring system, email individual owner, no immediate action required + +## Alarm Tuning and Maintenance + +### Reducing False Positives + +When alarms trigger frequently without real issues (alarm fatigue): + +1. **Adjust Thresholds**: Review alarm history, analyze patterns, increase threshold if too sensitive, use percentiles instead of max/min +2. **Increase Datapoints to Alarm**: Change from 1/1 to 2/3 to require sustained breach +3. **Use Composite Alarms**: Combine multiple signals for more accurate detection +4. **Implement Maintenance Windows**: Suppress alarms during deployments using CloudWatch alarm actions + +### Handling Alarm Flapping + +When an alarm rapidly switches between OK and ALARM: + +1. **Increase Evaluation Period**: Longer time windows smooth oscillations +2. **Add Hysteresis**: Different thresholds for alarm and recovery (e.g., alarm at 80%, recover at 70%) +3. 
**Use Anomaly Detection**: Adapts to patterns, less sensitive to threshold proximity + +## Alarm Testing + +**Test Checklist**: Alarm triggers on breach, recovers on return to normal, actions execute correctly, description is actionable, runbook link works, on-call receives notification within SLA. + +**Testing Approaches**: + +1. **Synthetic Testing**: Inject errors or load, verify alarm triggers, confirm notifications +2. **Historical Analysis**: Review past incidents, check if alarm would have triggered, adjust as needed +3. **Chaos Engineering**: Deliberately cause failures, validate detection and incident response + +## Integration with Incident Response + +**Alarm-Triggered Investigation**: Alarm triggers notification, on-call checks details, query CloudWatch Logs for errors, analyze Application Signals traces, check CloudTrail for recent changes (use data source priority), implement mitigation, update alarm if needed. + +**Proactive Monitoring**: Review alarm history daily, identify patterns and trends, tune thresholds before issues occur, add missing alarms for coverage gaps, document learnings in runbooks. diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/alerting-setup.md b/plugins/observability-on-aws/skills/observability-on-aws/references/alerting-setup.md new file mode 100644 index 00000000..35a3ae47 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/alerting-setup.md @@ -0,0 +1,95 @@ +# Alerting Setup + +## Purpose + +Guidance for setting up effective CloudWatch alarms using recommended configurations. For composite alarms, anomaly detection, SLO alerting, and tuning, see `alerting-advanced.md`. + +## Core Concepts + +**Alarm States**: OK (within threshold), ALARM (breached), INSUFFICIENT_DATA (not enough data). 
+ +**Alarm Components**: Metric (what to monitor), Statistic (how to aggregate), Threshold (trigger value), Evaluation Period (time window), Datapoints to Alarm (how many periods must breach). + +**Alarm Types**: Metric Alarm (single metric), Composite Alarm (combines alarms with AND/OR/NOT), Anomaly Detection Alarm (ML-based). + +## Getting Recommended Alarm Configurations + +Use `get_recommended_metric_alarms` with namespace, metric_name, dimensions, and statistic. Returns recommended threshold, evaluation periods, datapoints to alarm, description, and rationale. + +## Choosing the Right Statistic (must match metric type) + +- **Count** (use `Sum`): Errors, Faults, Throttles, Invocations, RequestCount +- **Utilization** (use `Average`): CPUUtilization, MemoryUtilization, DiskUtilization +- **Latency/Time** (use `Average` or percentiles): Duration, Latency, ResponseTime +- **Size** (use `Average`): PayloadSize, MessageSize, RequestSize + +## Alarm Configuration Patterns + +### Lambda Function Errors + +``` +Metric: AWS/Lambda - Errors | Statistic: Sum | Threshold: > 5 errors +Evaluation Period: 5 min | Datapoints: 2 of 3 +Rationale: Sum for count metrics; 2/3 reduces false positives +``` + +### EC2 CPU Utilization + +``` +Metric: AWS/EC2 - CPUUtilization | Statistic: Average | Threshold: > 80% +Evaluation Period: 5 min | Datapoints: 3 of 3 +Rationale: Average smooths spikes; 3/3 ensures sustained high CPU +``` + +### API Gateway Latency + +``` +Metric: AWS/ApiGateway - Latency | Statistic: p99 | Threshold: > 1000ms +Evaluation Period: 5 min | Datapoints: 2 of 3 +Rationale: p99 catches tail latency; 1000ms threshold for user experience +``` + +### RDS Database Connections + +``` +Metric: AWS/RDS - DatabaseConnections | Statistic: Average | Threshold: > 80% of max +Evaluation Period: 5 min | Datapoints: 3 of 3 +Rationale: Average shows sustained usage; prevents connection pool exhaustion +``` + +### DynamoDB Throttles + +``` +Metric: AWS/DynamoDB - 
ReadThrottleEvents / WriteThrottleEvents | Statistic: Sum +Threshold: > 10 throttle events | Evaluation Period: 1 min | Datapoints: 2 of 2 +Rationale: Sum for event counts; 1-min for quick detection; 2/2 confirms sustained throttling +``` + +## Alarm Best Practices + +**Naming Convention**: `[severity]-[service]-[metric]-[condition]` (e.g., `critical-api-error-rate-high`) + +**Descriptions** - use template: `"[Service] [Metric] is [Condition]. Current value: {{value}}. Threshold: {{threshold}}. Runbook: [link]. Dashboard: [link]."` + +**Treat Missing Data**: `notBreaching` for most alarms, `breaching` when missing data indicates a problem, `ignore` for sparse data. + +**Evaluation Periods**: Fast (1-2 min) for critical/user-facing. Balanced (5 min) for most metrics. Slow (10-15 min) for cost/capacity. + +## Quick Reference + +``` +# Lambda Errors +get_recommended_metric_alarms(namespace="AWS/Lambda", metric_name="Errors", + dimensions=[{name: "FunctionName", value: "my-function"}], statistic="Sum") +# EC2 CPU +get_recommended_metric_alarms(namespace="AWS/EC2", metric_name="CPUUtilization", + dimensions=[{name: "InstanceId", value: "i-1234567890abcdef0"}], statistic="Average") +# API Gateway Latency +get_recommended_metric_alarms(namespace="AWS/ApiGateway", metric_name="Latency", + dimensions=[{name: "ApiName", value: "my-api"}], statistic="p99") +# Check active alarms +get_active_alarms(state_value="ALARM") +# Review alarm history +get_alarm_history(alarm_name="my-alarm", history_item_type="StateUpdate", + start_time="2026-01-01T00:00:00Z", end_time="2026-01-20T00:00:00Z") +``` diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/application-signals-setup.md b/plugins/observability-on-aws/skills/observability-on-aws/references/application-signals-setup.md new file mode 100644 index 00000000..22956aeb --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/application-signals-setup.md @@ -0,0 +1,220 @@ +# 
Application Signals Setup and Enablement Guide + +This reference provides comprehensive guidance for setting up AWS Application Signals using the plugin's enablement guide feature. + +## Quick Start: Get Enablement Guide + +**First Step**: Always start by getting the official enablement guide from AWS: + +``` +Use the awslabs.cloudwatch-applicationsignals-mcp-server's get_enablement_guide tool with the following required parameters: + - service_platform (required): "ec2", "ecs", "lambda", or "eks" + - service_language (required): "python", "nodejs", "java", or "dotnet" + - iac_directory (required): Absolute path to your Infrastructure as Code directory + - app_directory (required): Absolute path to your application code directory +``` + +This tool provides: + +- Current prerequisites and requirements +- Step-by-step enablement instructions +- Service instrumentation guidance +- Best practices and recommendations +- Troubleshooting common issues + +## Application Signals Setup Workflow + +### Phase 1: Prerequisites Check + +Before enabling Application Signals, verify: + +1. **AWS Account Requirements**: + - Application Signals is available in your region + - Sufficient IAM permissions for CloudWatch and X-Ray + - EC2/ECS/EKS/Lambda services are running + +2. **Service Requirements**: + - Applications are instrumented with AWS X-Ray SDK or OpenTelemetry + - Services are running in supported compute environments + - Network connectivity allows telemetry data transmission + +### Phase 2: Enable Application Signals + +1. **Get the Latest Guide**: + + ``` + Use get_enablement_guide to get current setup instructions + ``` + +2. **Enable at Account Level**: + - Use AWS Console or CLI to enable Application Signals + - Configure service discovery settings + - Set up data retention policies + +3. 
**Verify Enablement**: + - Check that Application Signals is active + - Confirm data ingestion is working + - Validate service discovery + +### Phase 3: Service Instrumentation + +Based on the enablement guide, instrument your services: + +1. **For Lambda Functions**: + - Enable X-Ray tracing + - Add AWS X-Ray SDK to function code + - Configure environment variables + +2. **For ECS/EKS Services**: + - Deploy X-Ray daemon as sidecar + - Configure service mesh integration (if applicable) + - Set up OpenTelemetry collectors + +3. **For EC2 Applications**: + - Install X-Ray daemon + - Configure application instrumentation + - Set up auto-discovery tags + +### Phase 4: Configure Monitoring + +1. **Service Level Objectives (SLOs)**: + - Define availability targets + - Set latency thresholds + - Configure error rate limits + +2. **Dashboards and Alarms**: + - Create service overview dashboards + - Set up performance alarms + - Configure notification channels + +## Common Setup Patterns + +### Pattern 1: Microservices Architecture + +For distributed microservices: + +1. Get enablement guide for microservices setup +2. Enable Application Signals at the account level +3. Instrument each service with X-Ray SDK +4. Configure service maps and dependencies +5. Set up cross-service SLOs + +### Pattern 2: Serverless Applications + +For Lambda-based applications: + +1. Get enablement guide for serverless setup +2. Enable X-Ray tracing on all Lambda functions +3. Configure API Gateway integration +4. Set up end-to-end tracing +5. Create function-level SLOs + +### Pattern 3: Container Workloads + +For ECS/EKS applications: + +1. Get enablement guide for container setup +2. Deploy X-Ray daemon containers +3. Configure service discovery +4. Set up cluster-level monitoring +5. Create pod/task-level SLOs + +## Troubleshooting Setup Issues + +### Common Problems and Solutions + +1. 
**No Data Appearing**: + - Re-run get_enablement_guide for latest troubleshooting steps + - Verify service instrumentation + - Check IAM permissions + - Validate network connectivity + +2. **Incomplete Service Maps**: + - Ensure all services are instrumented + - Check X-Ray sampling rules + - Verify service naming consistency + +3. **Missing Metrics**: + - Confirm Application Signals is enabled + - Check service discovery configuration + - Validate metric emission + +## Best Practices from Enablement Guide + +Always refer to the latest enablement guide, but common best practices include: + +1. **Instrumentation**: + - Use consistent service naming + - Implement proper error handling + - Configure appropriate sampling rates + +2. **Monitoring**: + - Start with basic SLOs and iterate + - Use composite alarms for complex scenarios + - Set up proper alerting channels + +3. **Performance**: + - Monitor instrumentation overhead + - Optimize sampling configurations + - Use async telemetry transmission + +## Integration with Other Plugin Features + +### Combine with CloudWatch Logs + +Use Logs Insights queries to correlate Application Signals data with logs: + +``` +fields @timestamp, @message, traceId, duration +| filter ispresent(traceId) +| sort duration desc +| limit 100 +``` + +### Combine with CloudTrail + +Track Application Signals configuration changes: + +- Monitor EnableApplicationSignals API calls +- Track service configuration modifications +- Audit SLO changes + +## Validation Steps + +After setup, validate your Application Signals implementation: + +1. **Data Flow Validation**: + - Confirm traces are appearing in X-Ray + - Verify metrics in CloudWatch + - Check service maps are populated + +2. **SLO Validation**: + - Test SLO calculations + - Verify alarm triggering + - Confirm notification delivery + +3. 
**Performance Validation**: + - Monitor instrumentation overhead + - Check data ingestion costs + - Validate query performance + +## Next Steps After Setup + +Once Application Signals is enabled: + +1. Create comprehensive service dashboards +2. Set up automated alerting workflows +3. Implement SLO-based incident response +4. Train team on Application Signals features +5. Establish monitoring best practices + +## Resources + +- Use `get_enablement_guide` tool for latest official documentation +- [Application Signals User Guide](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Application-Signals.html) +- [X-Ray Developer Guide](https://docs.aws.amazon.com/xray/latest/devguide/) +- [OpenTelemetry Documentation](https://opentelemetry.io/docs/) + +--- + +**Remember**: Always start with the `get_enablement_guide` tool to get the most current and accurate setup instructions for your specific environment and use case. diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/cloudtrail-data-source-selection.md b/plugins/observability-on-aws/skills/observability-on-aws/references/cloudtrail-data-source-selection.md new file mode 100644 index 00000000..f5492d17 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/cloudtrail-data-source-selection.md @@ -0,0 +1,92 @@ +# CloudTrail Data Source Selection Strategy + +## Purpose + +This is a utility guide referenced by `security-auditing.md` and other reference files for CloudTrail data access priority logic. It is not intended to be loaded directly in response to user queries. + +## Data Source Priority + +### Priority 1: CloudTrail Lake (Event Data Store) + +**Tool**: `list_event_data_stores` from CloudTrail MCP server + +**When to use**: Event data store exists with management events enabled. 
+ +- Native SQL support for complex queries +- Configurable retention (up to 2,557 days / ~7 years) +- Cross-account and cross-region aggregation + +### Priority 2: CloudWatch Logs (CloudTrail Integration) + +**Tool**: `describe_log_groups` from CloudWatch MCP server + +**When to use**: CloudTrail is configured to send events to CloudWatch Logs. + +- Real-time event streaming with alarm integration +- CloudWatch Logs Insights query support +- Common log group patterns: `/aws/cloudtrail/logs`, `/aws/cloudtrail/`, `CloudTrail/logs` + +### Priority 3: CloudTrail Lookup Events API + +**Tool**: `lookup_events` from CloudTrail MCP server + +**When to use**: Neither CloudTrail Lake nor CloudWatch Logs available. + +- Only last 90 days of events +- Limited to 50 results per API call +- Basic filtering only (no SQL support) + +## Implementation Workflow + +1. **Check CloudTrail Lake**: Call `list_event_data_stores`. If an enabled store with management events exists, use `lake_query` for SQL-based analysis. +2. **Check CloudWatch Logs**: Call `describe_log_groups` searching for "cloudtrail". If found, use `execute_log_insights_query`. +3. **Fall back to Lookup Events**: Use `lookup_events` with `LookupAttributes`. Limited to 90 days and 50 results per call. + +## Decision Tree + +``` +Need CloudTrail Data? + | + v +Check CloudTrail Lake (list_event_data_stores) + +-- Enabled? --> YES --> Use lake_query + +-- NO --> Check CloudWatch Logs (describe_log_groups) + +-- CloudTrail Log Group? 
--> YES --> Use execute_log_insights_query + +-- NO --> Use lookup_events (90 days, basic filtering) +``` + +## Query Translation Example: Find IAM User Deletions + +**CloudTrail Lake (SQL)**: + +```sql +SELECT eventTime, userIdentity.userName, + requestParameters.userName AS deletedUser, sourceIPAddress +FROM <event-data-store-id> +WHERE eventName = 'DeleteUser' + AND eventTime > timestamp '2024-01-01 00:00:00' +ORDER BY eventTime DESC +``` + +**CloudWatch Logs Insights**: + +``` +fields eventTime, userIdentity.userName, requestParameters.userName, sourceIPAddress +| filter eventName = "DeleteUser" +| sort eventTime desc | limit 50 +``` + +**Lookup Events API**: + +``` +lookup_events( + LookupAttributes=[{'AttributeKey': 'EventName', 'AttributeValue': 'DeleteUser'}], + StartTime='2024-01-01T00:00:00Z', MaxResults=50 +) +``` + +## Error Handling + +- **CloudTrail Lake not available**: `list_event_data_stores` returns empty or no enabled stores. Proceed to Priority 2. +- **CloudWatch Logs not available**: No CloudTrail log groups found. Proceed to Priority 3. +- **Lookup Events limits exceeded**: Data older than 90 days or complex filtering needed. Inform user that CloudTrail Lake or CloudWatch Logs integration is required. diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/incident-patterns.md b/plugins/observability-on-aws/skills/observability-on-aws/references/incident-patterns.md new file mode 100644 index 00000000..90b049a6 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/incident-patterns.md @@ -0,0 +1,75 @@ +# Common Incident Patterns + +## Purpose + +Common incident patterns with investigation approaches and prevention strategies. For the overall incident response framework, see `incident-response.md`. For detailed log query syntax, see `log-analysis.md`. 
+ +## Pattern 1: Deployment-Related Incident + +**Symptoms**: + +- Errors spike immediately after deployment +- Specific service shows elevated error rate +- Traces show new error types + +**Investigation**: Query logs for errors after deployment timestamp. Group errors by type to identify new patterns. Compare error types before and after deployment. See `log-analysis.md` for query syntax. + +**Mitigation**: Rollback deployment + +**Prevention**: Implement canary deployments, add integration tests, improve staging environment + +## Pattern 2: Resource Exhaustion + +**Symptoms**: + +- Gradual performance degradation +- Timeouts and connection errors +- High CPU or memory utilization + +**Investigation**: Check CloudWatch Metrics for resource utilization. Query logs for timeout errors. Review Application Signals for latency increases. Check Billing & Cost Management for usage spikes. + +**Mitigation**: Scale resources, optimize code + +**Prevention**: Set up auto-scaling, implement resource limits, add capacity planning alarms + +## Pattern 3: Dependency Failure + +**Symptoms**: + +- Errors calling external service +- Traces show failures in downstream calls +- Service map shows unhealthy dependency + +**Investigation**: Filter logs for dependency-related errors. Parse service names and status codes. Correlate with Application Signals service map. Check traces for downstream call failures. See `log-analysis.md` for query syntax. + +**Mitigation**: Implement circuit breaker, add fallback behavior, route around failed dependency + +**Prevention**: Add dependency health checks, implement retry logic with backoff, set up dependency SLOs + +## Pattern 4: Database Performance + +**Symptoms**: + +- Slow query performance +- Database connection pool exhaustion +- High database CPU utilization + +**Investigation**: Parse SQL queries and durations from logs. Filter for slow queries (e.g., duration > 1000ms). Aggregate by query pattern to find the most impactful queries. 
Correlate with database CloudWatch Metrics (CPU, connections, IOPS). See `log-analysis.md` for query syntax. + +**Mitigation**: Add database indexes, optimize queries, scale database resources, implement query caching + +**Prevention**: Regular query performance reviews, database monitoring and alerting, connection pool tuning + +## Pattern 5: Traffic Spike + +**Symptoms**: + +- Sudden increase in request volume +- Rate limiting errors +- Resource exhaustion + +**Investigation**: Check CloudWatch Metrics for request rates. Query logs for request patterns. Review Application Signals for traffic sources. Check Billing & Cost Management for usage spikes. + +**Mitigation**: Enable auto-scaling, implement rate limiting, add caching layer + +**Prevention**: Capacity planning, load testing, auto-scaling configuration, DDoS protection diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/incident-postmortem.md b/plugins/observability-on-aws/skills/observability-on-aws/references/incident-postmortem.md new file mode 100644 index 00000000..28cca0f7 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/incident-postmortem.md @@ -0,0 +1,108 @@ +# Root Cause Analysis and Postmortems + +## Purpose + +Root cause analysis methodology and postmortem templates. For the overall incident response framework, see `incident-response.md`. For common incident patterns, see `incident-patterns.md`. + +## Phase 5: Root Cause Analysis + +### Timeline Construction + +Create a detailed timeline from first anomaly to resolution. Mark key milestones: incident start, detection, mitigation, and resolution. + +### Five Whys Analysis + +``` +Problem: API returned 500 errors +Why? Lambda function timed out +Why? Database queries were slow +Why? Database was under heavy load +Why? New feature caused N+1 query problem +Why? 
Code review didn't catch the inefficient query pattern +Root Cause: Insufficient code review process for database queries +``` + +### Contributing Factors + +- **Technical**: Code bugs, configuration errors, infrastructure limits +- **Process**: Inadequate testing, missing monitoring, insufficient review +- **Human**: Knowledge gaps, communication issues, alert fatigue + +### Evidence Collection + +- Log excerpts showing errors (see `log-analysis.md` for query patterns) +- Metric graphs showing anomalies +- Trace examples demonstrating failures (see `performance-monitoring.md`) +- CloudTrail events showing changes (see `security-auditing.md`) +- Cost data showing resource usage + +## Phase 6: Postmortem and Prevention + +### Postmortem Template + +```markdown +# Incident Postmortem: [Incident Title] + +## Summary + +- **Date**: YYYY-MM-DD +- **Duration**: X hours Y minutes +- **Severity**: SEVX +- **Impact**: Description of user impact +- **Root Cause**: Brief root cause statement + +## Timeline + +- HH:MM - Incident began +- HH:MM - First alert triggered +- HH:MM - Root cause identified +- HH:MM - Mitigation applied +- HH:MM - Service restored + +## What Happened + +Detailed description of the incident + +## Root Cause + +Detailed root cause analysis with evidence + +## Resolution + +How the incident was resolved + +## Impact + +- Users affected: X | Requests failed: Y | SLO budget consumed: Z% + +## Action Items + +1. [ ] Immediate fix (Owner, Due Date) +2. [ ] Monitoring improvement (Owner, Due Date) +3. 
[ ] Process change (Owner, Due Date) + +## Lessons Learned + +What went well and what could be improved +``` + +### Prevention Strategies + +- **Monitoring**: Add missing alarms, improve thresholds, create composite alarms, set up SLOs +- **Testing**: Add failure scenario tests, implement chaos engineering, load test for capacity +- **Process**: Update deployment procedures, improve code review checklists, implement gradual rollouts +- **Architecture**: Add redundancy for single points of failure, implement circuit breakers, add caching + +## Example: Complete Incident Investigation + +**Scenario**: API returning 500 errors + +1. **Detect**: CloudWatch alarm "api-error-rate" triggers +2. **Assess**: Application Signals shows 15% error rate on api-service (normal: 0.1%) +3. **Query Logs**: Error logs show database timeout exceptions (see `log-analysis.md`) +4. **Trace**: Application Signals traces show timeout calling database +5. **Metrics**: RDS CPUUtilization at 95% +6. **CloudTrail**: Check for recent changes (see `cloudtrail-data-source-selection.md`) +7. **Analyze Logs**: Slow query analysis reveals N+1 pattern - 10,000+ queries per request +8. **Root Cause**: New deployment introduced N+1 query causing database overload +9. **Mitigate**: Rollback deployment, verify error rate returns to normal, fix N+1 query diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/incident-response.md b/plugins/observability-on-aws/skills/observability-on-aws/references/incident-response.md new file mode 100644 index 00000000..ac1f3f2c --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/incident-response.md @@ -0,0 +1,100 @@ +# Incident Response and Troubleshooting + +## Purpose + +Guidance for responding to incidents using the full AWS observability stack. For common incident patterns, see `incident-patterns.md`. For root cause analysis and postmortem templates, see `incident-postmortem.md`. 
+ +## Incident Response Framework + +### Phase 1: Detection and Triage + +**Severity Classification**: + +- **SEV1 (Critical)**: Complete service outage, data loss, security breach +- **SEV2 (High)**: Major functionality impaired, significant user impact +- **SEV3 (Medium)**: Partial functionality impaired, workaround available +- **SEV4 (Low)**: Minor issue, minimal user impact +- **SEV5 (Informational)**: No immediate impact, cosmetic or non-urgent + +**Actions**: + +1. **Check Active Alarms** - Query CloudWatch for alarms in ALARM state, review alarm history and timing +2. **Review Application Signals** - Check service health, SLO status, error rates, and service maps +3. **Assess Impact** - Query logs for error volume, check request counts and success rates + +### Phase 2: Investigation + +**Data Collection Sources**: + +1. **CloudWatch Logs Insights** - See `log-analysis.md` for detailed query patterns. Key query for incident context: + + ``` + # Quick error snapshot for incident timeframe + fields @timestamp, @logStream, @message, level, errorType, requestId + | filter level = "ERROR" + | sort @timestamp asc + | limit 100 + ``` + +2. **Application Signals Traces** - Search for failed traces, analyze timelines for bottlenecks, examine error spans +3. **CloudWatch Metrics** - Compare affected resource metrics with baseline, check for resource exhaustion +4. **CloudTrail Events** - See `security-auditing.md` for detailed patterns. Follow data source priority from `cloudtrail-data-source-selection.md`. Query for recent configuration changes and deployments. + +### Phase 3: Mitigation + +**Common Mitigation Strategies**: + +1. **Rollback Deployment** - Check CloudTrail for recent deployments, rollback to previous stable version +2. **Scale Resources** - Increase capacity (EC2, Lambda concurrency), add read replicas, enable auto-scaling +3. **Circuit Breaker** - Route traffic away from failing dependency, enable degraded mode +4. 
**Rate Limiting** - Implement rate limiting at API Gateway, block malicious IPs, enable WAF rules +5. **Database Optimization** - Identify slow queries, add indexes, scale database resources + +### Phase 4: Recovery Verification + +**Verification Steps**: + +1. **Check Alarms** - Verify alarms returned to OK state, monitor for flapping +2. **Validate Application Signals** - Confirm error rates normalized and latency within SLO targets +3. **Query Logs**: + + ``` + # Verify error rate has decreased + fields @timestamp, level + | stats count(*) as totalLogs, + sum(level = "ERROR") as errorCount + by bin(1m) + | sort @timestamp asc + | limit 60 + ``` + +4. **Monitor Metrics** - Verify CPU, memory, network, request rates, and latencies + +## Complete Investigation Workflow + +1. Detection (CloudWatch Alarms) +2. Service Health (Application Signals) +3. Error Analysis (CloudWatch Logs Insights) +4. Trace Analysis (Application Signals Traces) +5. Metric Correlation (CloudWatch Metrics) +6. Change Detection (CloudTrail) +7. Cost Impact (Billing & Cost Management MCP server) +8. Documentation (AWS Documentation) + +For detailed patterns on individual tools, see `log-analysis.md`, `performance-monitoring.md`, and `security-auditing.md`. 
+ +## Quick Reference: Incident Checklist + +- [ ] Check active alarms +- [ ] Review Application Signals service health +- [ ] Query logs for errors +- [ ] Analyze traces for failures +- [ ] Check metrics for anomalies +- [ ] Review CloudTrail for changes +- [ ] Assess cost impact +- [ ] Document timeline +- [ ] Implement mitigation +- [ ] Verify recovery +- [ ] Conduct root cause analysis (see `incident-postmortem.md`) +- [ ] Write postmortem (see `incident-postmortem.md`) +- [ ] Implement preventive measures diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/log-analysis.md b/plugins/observability-on-aws/skills/observability-on-aws/references/log-analysis.md new file mode 100644 index 00000000..224252e4 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/log-analysis.md @@ -0,0 +1,314 @@ +# CloudWatch Logs Insights Analysis + +## Purpose + +This reference provides guidance for using CloudWatch Logs Insights QL syntax for log analysis, troubleshooting, and data extraction via the CloudWatch MCP server. + +## MCP Server Tools + +### Primary Tools + +- `execute_log_insights_query` - Run Logs Insights queries with automatic polling +- `describe_log_groups` - Discover log groups and saved queries +- `analyze_log_group` - Detect anomalies and common patterns +- `get_logs_insight_query_results` - Retrieve results from timed-out queries +- `cancel_logs_insight_query` - Cancel running queries + +### Key Parameters for execute_log_insights_query + +- `query_string` - The Logs Insights QL query +- `log_group_names` OR `log_group_identifiers` - Target log groups (exactly one required) +- `start_time` / `end_time` - ISO 8601 format (e.g., "2025-02-06T10:00:00+00:00") +- `limit` - Max results (CRITICAL: always use to avoid context overflow) + +## Query Construction Principles + +### 1. 
Always Limit Results + +Every query should include a limit to avoid overwhelming context: + +``` +fields @timestamp, @message +| limit 50 +``` + +### 2. Use Pipe Syntax + +Commands are separated by pipe (`|`) characters: + +``` +fields @timestamp, @message +| filter @message like /ERROR/ +| sort @timestamp desc +| limit 100 +``` + +### 3. Built-in Fields Use @ Prefix + +CloudWatch auto-discovers these fields: + +- `@timestamp` - Log event timestamp +- `@message` - Raw log message +- `@logStream` - Log stream name +- `@log` - Log group identifier +- `@ingestionTime` - When CloudWatch received the event + +### 4. Comments Use Hash + +``` +# This is a comment +fields @timestamp, @message +| limit 50 +``` + +## Core Commands + +### fields + +Select and transform fields: + +``` +fields @timestamp, @message, @logStream +fields @timestamp, concat(@logStream, '-', @message) as combined +``` + +### filter + +Filter log events: + +``` +filter @message like /ERROR/ +filter @message like /(?i)exception/ +filter statusCode >= 400 +filter ispresent(requestId) +filter @message not like /DEBUG/ +``` + +### stats + +Aggregate statistics: + +``` +stats count(*) by bin(1h) +stats avg(duration), max(duration), min(duration) by serviceName +stats count(*) as errorCount by errorType +| sort errorCount desc +``` + +### parse + +Extract fields from messages: + +``` +# Glob pattern +parse @message "user=* action=* status=*" as user, action, status + +# Regex pattern +parse @message /user=(?<user>\S+)/ +parse @message /duration=(?<duration>\d+)ms/ +``` + +### sort + +Order results: + +``` +sort @timestamp desc +sort duration desc +sort errorCount asc +``` + +### limit + +Cap result count: + +``` +limit 100 +``` + +### dedup + +Remove duplicates: + +``` +dedup requestId +dedup @logStream, errorType +``` + +## Common Query Patterns + +### Pattern 1: Basic Error Search + +``` +fields @timestamp, @message, @logStream +| filter @message like /ERROR/ +| sort @timestamp desc +| limit 50 +``` + +### Pattern 2: 
Count Errors by Type + +``` +filter @message like /ERROR/ +| parse @message /ERROR: (?<errorType>[^:]+)/ +| stats count(*) as count by errorType +| sort count desc +| limit 20 +``` + +### Pattern 3: Lambda Cold Starts + +``` +filter @type = "REPORT" +| parse @message /Init Duration: (?<initDuration>[\d\.]+)/ +| filter ispresent(initDuration) +| stats count(*) as coldStarts, avg(initDuration) as avgInitMs by bin(1h) +| limit 100 +``` + +### Pattern 4: API Latency Analysis + +``` +filter ispresent(duration) +| stats avg(duration) as avgMs, + pct(duration, 95) as p95Ms, + pct(duration, 99) as p99Ms, + max(duration) as maxMs + by bin(5m) +| sort @timestamp desc +| limit 100 +``` + +### Pattern 5: HTTP Status Code Distribution + +``` +filter ispresent(statusCode) +| stats count(*) as requests by statusCode +| sort statusCode asc +| limit 100 +``` + +### Pattern 6: Find Slow Requests + +``` +filter duration > 1000 +| fields @timestamp, @requestId, duration, @message +| sort duration desc +| limit 25 +``` + +### Pattern 7: JSON Log Parsing + +``` +fields @timestamp, @message +| parse @message '{"level":"*","message":"*","requestId":"*"}' as level, msg, reqId +| filter level = "ERROR" +| limit 50 +``` + +### Pattern 8: Unique Error Messages + +``` +filter @message like /ERROR/ +| dedup @message +| limit 30 +``` + +### Pattern 9: Request Tracing + +``` +filter @requestId = "abc-123-def" +| fields @timestamp, @message, @logStream +| sort @timestamp asc +| limit 100 +``` + +### Pattern 10: Anomaly Detection + +``` +pattern @message +| anomaly +| limit 50 +``` + +## Functions Reference + +### String Functions + +- `strlen(field)` - String length +- `trim(field)` - Remove whitespace +- `tolower(field)` / `toupper(field)` - Case conversion +- `concat(a, b, ...)` - Concatenate strings +- `replace(field, 'old', 'new')` - Replace substring + +### Numeric Functions + +- `abs(num)` - Absolute value +- `ceil(num)` / `floor(num)` - Rounding +- `greatest(a, b, ...)` / `least(a, b, ...)` - Min/max of values +- 
`log(num)` / `sqrt(num)` - Math functions + +### Date/Time Functions + +- `datefloor(timestamp, period)` - Round down to period +- `dateceil(timestamp, period)` - Round up to period +- `bin(period)` - Group by time bucket (1m, 5m, 1h, 1d) +- `fromMillis(ms)` - Convert epoch ms to timestamp +- `toMillis(timestamp)` - Convert timestamp to epoch ms + +### Aggregation Functions (use with stats) + +- `count(*)` / `count(field)` - Count events +- `sum(field)` - Sum values +- `avg(field)` - Average +- `min(field)` / `max(field)` - Min/max +- `pct(field, percentile)` - Percentile (e.g., pct(duration, 95)) +- `stddev(field)` - Standard deviation +- `earliest(field)` / `latest(field)` - First/last by time + +### Conditional Functions + +- `ispresent(field)` - Check if field exists +- `isempty(field)` - Check if field is empty +- `isblank(field)` - Check if field is blank +- `coalesce(a, b, ...)` - First non-null value + +### IP Functions + +- `isValidIp(field)` - Validate IP address +- `isValidIpV4(field)` / `isValidIpV6(field)` - Validate specific version +- `isIpInSubnet(ip, subnet)` - Check subnet membership + +## Best Practices + +1. **Start with describe_log_groups** - Discover available log groups first +2. **Use narrow time ranges** - Minimize scanned data and costs +3. **Always include limit** - Prevent context window overflow +4. **Test incrementally** - Start simple, add complexity +5. **Use filterIndex for indexed fields** - Improves performance on large datasets +6. **Parse JSON early** - Extract fields before filtering when possible +7. **Use analyze_log_group** - For quick anomaly and pattern detection + +## Common Pitfalls + +1. **Missing limit** - Queries can return massive results +2. **Wrong time format** - Use ISO 8601 with timezone +3. **Case sensitivity** - Field names and regex are case-sensitive +4. **Missing ispresent()** - Filter on field existence before using it +5. **Regex escaping** - Use `/pattern/` syntax, escape special chars +6. 
**Large time ranges** - Can be slow and expensive + +## Query Construction Workflow + +When a user asks for log analysis: + +1. **Discover log groups** - Use `describe_log_groups` if unknown +2. **Define time range** - Ask for or suggest appropriate window +3. **Identify requirements** - What fields, filters, aggregations? +4. **Build query** - Start simple, use patterns above +5. **Add limit** - Always cap results (50-100 typical) +6. **Execute and iterate** - Refine based on results + +--- + +**Remember**: Always include a `limit` clause to avoid overwhelming the agent context. Start with 50-100 results and increase if needed. diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/observability-gap-analysis.md b/plugins/observability-on-aws/skills/observability-on-aws/references/observability-gap-analysis.md new file mode 100644 index 00000000..1849ef1c --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/observability-gap-analysis.md @@ -0,0 +1,90 @@ +# Codebase Observability Gap Analysis + +## Purpose + +Guidance for analyzing codebases to identify observability gaps and provide actionable recommendations for instrumentation, logging, metrics, and tracing. + +## Analysis Framework + +### 1. Logging Analysis + +**What to Check:** Error handling blocks without logging, missing structured logging (JSON format), inconsistent log levels, missing correlation IDs/request IDs, sensitive data in logs (PII, credentials), missing context (user, session, transaction). + +**Recommendations:** Use structured logging libraries (structlog, logrus, winston). Add correlation IDs to all entries. Log at appropriate levels. Include contextual metadata (service, version, environment). Sanitize sensitive data. + +### 2. 
Metrics Collection + +**What to Check:** Missing business metrics (orders, signups, conversions), no performance metrics (latency, throughput), missing resource utilization metrics, no error rate tracking, missing custom CloudWatch metrics, no metric dimensions. + +**Recommendations:** Instrument key business operations. Track request duration and count. Monitor error rates by type. Add custom CloudWatch metrics for business KPIs. Use metric dimensions (service, endpoint, status). + +### 3. Distributed Tracing + +**What to Check:** Missing OpenTelemetry SDK integration, no trace context propagation (W3C Trace Context), missing spans for external calls, no span attributes for business context, no sampling strategy, legacy X-Ray SDK without OTEL migration path. + +**Recommendations:** Use OpenTelemetry for instrumentation (vendor-neutral). Deploy ADOT as the collector. Enable AWS Application Signals for automatic APM. Propagate trace context across service boundaries. Implement adaptive sampling for high-volume services. + +### 4. Error Handling + +**What to Check:** Silent failures (empty catch blocks), generic error messages, missing error context, no error categorization, missing retry logic, no circuit breaker patterns. + +**Recommendations:** Log all errors with full context. Use specific error types/classes. Include stack traces. Categorize errors (transient, permanent, user). Implement exponential backoff. Add circuit breakers for external dependencies. + +### 5. Health Checks & Readiness + +**What to Check:** Missing health check endpoints, no readiness probes, incomplete dependency checks, no graceful shutdown handling. + +**Recommendations:** Implement `/health` and `/ready` endpoints. Include version and build info. Implement graceful shutdown. Add startup probes for slow-starting services. + +## Analysis Workflow + +1. **Scan Entry Points** - API endpoints/routes, Lambda handlers, message queue consumers, scheduled jobs +2. 
**Identify Critical Paths** - Authentication, payment processing, data mutations, external service calls +3. **Check Instrumentation** - Logging at entry/exit, error handling coverage, metrics emission, trace propagation +4. **Generate Report** - List gaps by severity, provide code examples, estimate effort, prioritize recommendations + +## Common Observability Gaps + +**Critical:** No error logging, missing trace context, silent failures (empty catch blocks), no health checks, sensitive data exposure in logs. + +**High:** Unstructured logging (print/console.log), missing correlation IDs, no custom CloudWatch metrics, generic error messages, missing request/response logging. + +**Medium:** Inconsistent log levels, missing business metrics, no sampling strategy, insufficient debugging context, logs not centralized. + +**Low:** Verbose DEBUG output in production, missing log rotation, no log retention policy, inconsistent naming conventions, missing runbooks. + +## Actionable Recommendations Template + +For each gap identified, provide: + +- **Gap:** Description +- **Severity:** Critical/High/Medium/Low +- **Location:** File:Line or Pattern +- **Impact:** What problems this causes +- **Recommendation:** Specific fix with before/after code +- **Effort:** Hours/Days estimate +- **Priority:** 1-5 + +## Integration with AWS Observability + +1. **CloudWatch Logs** - Configure log groups per service, set retention policies, enable Log Insights, use metric filters +2. **CloudWatch Metrics** - Define custom metrics with limited dimensions, set up dashboards and alarms, use EMF for Lambda +3. **OpenTelemetry + ADOT + Application Signals** - Integrate OTEL SDK, deploy ADOT Collector, enable Application Signals for automatic APM, configure sampling rules, export to X-Ray/CloudWatch +4. 
**Application Signals (APM)** - Automatic service discovery, SLO tracking and error budgets, service health monitoring (P50/P90/P99), correlation across traces/metrics/logs + +## Best Practices Checklist + +- [ ] Structured logging with JSON format +- [ ] Correlation IDs in all logs +- [ ] Error logging with stack traces +- [ ] Metrics for key operations +- [ ] Distributed tracing with OpenTelemetry +- [ ] Health check endpoints +- [ ] Graceful error handling +- [ ] No sensitive data in logs +- [ ] Log levels used appropriately +- [ ] Business and performance metrics tracked +- [ ] Trace context propagated +- [ ] Sampling strategy defined + +For language-specific code patterns and examples, see `observability-language-patterns.md`. diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/observability-language-patterns.md b/plugins/observability-on-aws/skills/observability-on-aws/references/observability-language-patterns.md new file mode 100644 index 00000000..b60031a8 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/observability-language-patterns.md @@ -0,0 +1,103 @@ +# Observability Language Patterns + +Language-specific observability patterns with GOOD/BAD code examples. 
+ +## Python (structlog) + +```python +# GOOD: Structured logging with context +import structlog +logger = structlog.get_logger() +def process_order(order_id, user_id): + log = logger.bind(order_id=order_id, user_id=user_id) + try: + log.info("processing_order_started") + log.info("processing_order_completed", duration_ms=duration) + except Exception as e: + log.error("processing_order_failed", error=str(e), exc_info=True) + raise +# BAD: Unstructured, no context, silent failure +def process_order(order_id, user_id): + try: print("Processing order") + except: pass +``` + +## Java (SLF4J + MDC) + +```java +// GOOD: Structured logging with MDC +public void processOrder(String orderId, String userId) { + MDC.put("orderId", orderId); MDC.put("userId", userId); + try { + logger.info("Processing order started"); + } catch (Exception e) { + logger.error("Processing order failed", e); throw e; + } finally { MDC.clear(); } +} +// BAD: No context, silent failure +public void processOrder(String orderId, String userId) { + try { System.out.println("Processing"); } + catch (Exception e) { /* silent */ } +} +``` + +## JavaScript/TypeScript (Winston) + +```javascript +// GOOD: Structured logging with Winston +const logger = winston.createLogger({ + format: winston.format.json(), + defaultMeta: { service: 'order-service' }, +}); +async function processOrder(orderId, userId) { + const log = logger.child({ orderId, userId }); + try { + log.info('Processing order started'); + } catch (error) { + log.error('Processing order failed', { error: error.message, stack: error.stack }); + throw error; + } +} +// BAD: Console logging, silent failure +async function processOrder(orderId, userId) { + try { console.log('Processing order'); } + catch (error) { /* silent */ } +} +``` + +## Go (zerolog) + +```go +// GOOD: Structured logging with zerolog +func processOrder(orderID, userID string) error { + logger := log.With().Str("order_id", orderID).Str("user_id", userID).Logger() + 
logger.Info().Msg("Processing order started") + if err := doWork(); err != nil { + logger.Error().Err(err).Msg("Processing order failed") + return err + } + return nil +} +// BAD: No structure, no error logging +func processOrder(orderID, userID string) error { + fmt.Println("Processing order") + if err := doWork(); err != nil { return err } + return nil +} +``` + +## Cost Optimization + +**Logging:** Production level INFO/WARN. Sample high-throughput (1 in 100). CloudWatch retention 7-30 days. EMF for Lambda. Avoid hot-path logging. +**Metrics:** Max 10 dimensions per metric. Aggregate at source. Standard 1-min resolution. Use metric math in dashboards. +**Tracing:** Head-based sampling. Adaptive rates (higher for errors). 1% high-volume, 100% low-volume. Skip health checks. ADOT for local aggregation. + +## Example Analysis Output + +```markdown +# Observability Analysis: 45 files | Critical: 3 | High: 8 | Medium: 12 + +## Critical: Missing Error Logging in Payment Handler + +Location: src/handlers/payment.py:45-60 | Fix: Add structured error logging +``` diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/performance-monitoring.md b/plugins/observability-on-aws/skills/observability-on-aws/references/performance-monitoring.md new file mode 100644 index 00000000..dfada483 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/performance-monitoring.md @@ -0,0 +1,99 @@ +# Application Signals Performance Monitoring + +## Purpose + +Guidance for using AWS CloudWatch Application Signals to monitor application performance, health, and dependencies. For trace analysis and query patterns, see `performance-traces.md`. For SLO configuration and troubleshooting, see `performance-slos.md`. 
+ +## Application Signals Overview + +Application Signals provides automatic instrumentation with service-level metrics (latency, error rate, volume), distributed tracing via X-Ray, service maps, SLOs, and automatic discovery requiring no code changes. + +## Core Concepts + +- **Services**: Logical components (microservices, Lambda functions, API endpoints, DB connections) +- **Operations**: Actions within a service (API endpoints, DB queries, external calls, queue ops) +- **SLOs**: Expected performance targets (availability %, latency thresholds, custom metrics) +- **Traces**: End-to-end request flows with trace IDs, spans, annotations, and subsegments + +## Common Monitoring Tasks + +1. **View Service Health**: List services by time range/status; check request count, error rate, P99 latency +2. **Analyze Dependencies**: Get service map over 24h; look for downstream/upstream bottlenecks +3. **Investigate Issues**: Compare current vs baseline latency, filter slow traces, check dependencies +4. 
**Set Up SLOs**: Define availability (99.9% over 30d) and latency targets (95% under 500ms over 7d) + +## Application Signals MCP Server Tools + +**Primary Audit Tools**: `audit_services` (service health with wildcards), `audit_slos` (SLO compliance), `audit_service_operations` (operation-level analysis) + +**Service Discovery**: `list_monitored_services`, `get_service_detail`, `list_service_operations` (max 24h lookback) + +**SLO Management**: `list_slos`, `get_slo`, `list_slis` (legacy SLI breach summary) + +**Metrics**: `query_service_metrics` (Latency, Error, Fault for a service) + +**Trace & Log Analysis**: `search_transaction_spans` (100% sampled, Logs Insights), `query_sampled_traces` (5% sampled, X-Ray filters) + +**Canary, Change Events & Enablement**: `analyze_canary_failures`, `list_change_events` (`comprehensive_history=True` for ListEntityEvents API or `False` for ListServiceStates), `get_enablement_guide` + +## Target Format Reference + +**All services:** + +```json +[{ "Type": "service", "Data": { "Service": { "Type": "Service", "Name": "*" } } }] +``` + +**Wildcard pattern:** + +```json +[{ "Type": "service", "Data": { "Service": { "Type": "Service", "Name": "*payment*" } } }] +``` + +**Specific service:** + +```json +[{ + "Type": "service", + "Data": { + "Service": { "Type": "Service", "Name": "checkout-service", "Environment": "eks:prod-cluster" } + } +}] +``` + +**All SLOs:** + +```json +[{ "Type": "slo", "Data": { "Slo": { "SloName": "*" } } }] +``` + +**Operation targets:** + +```json +[{ + "Type": "service_operation", + "Data": { + "ServiceOperation": { + "Service": { "Type": "Service", "Name": "*payment*" }, + "Operation": "*GET*", + "MetricType": "Latency" + } + } +}] +``` + +MetricType options: `Latency`, `Availability`, `Fault`, `Error` + +## Auditor Selection Guide + +| Scenario | Auditors | +| ------------------------ | ---------------------------------- | +| Quick health check | Default (omit parameter) | +| Root cause analysis | `all` 
| +| SLO breach investigation | `all` | +| Error investigation | `log,trace` | +| Dependency issues | `dependency_metric,trace` | +| Find outlier hosts | `top_contributor,operation_metric` | +| Quota monitoring | `service_quota,operation_metric` | + +The 7 auditor types: `slo`, `operation_metric`, `trace`, `log`, `dependency_metric`, `top_contributor`, `service_quota` diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/performance-slos.md b/plugins/observability-on-aws/skills/observability-on-aws/references/performance-slos.md new file mode 100644 index 00000000..f93e0155 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/performance-slos.md @@ -0,0 +1,95 @@ +# Application Signals SLOs & Troubleshooting + +## Purpose + +SLO configuration, performance metrics, alerting, and troubleshooting workflows for Application Signals. + +## Performance Metrics Reference + +- **Request**: RequestCount, SuccessCount (< 400), FaultCount (>= 500), ErrorCount (400-499) +- **Latency**: Duration, P50, P90, P95, P99, P99.9 +- **Rate**: ErrorRate = (ErrorCount / RequestCount) * 100, FaultRate, SuccessRate +- **Throughput**: RequestsPerSecond = RequestCount / time_window, BytesIn, BytesOut + +## SLO Configuration Best Practices + +**Availability**: High-availability = 99.95% over 30 days (alert when error budget < 20%). Standard = 99.9% over 7 days. + +**Latency**: User-facing API = 95% of requests < 500ms over 7 days (P95). Background processing = 99% of jobs < 30s over 30 days (P99). + +### Custom SLOs + +**Data freshness**: 99% of updates < 5 minutes old over 24h, custom metric (data_age). + +## Performance Troubleshooting Workflows + +### Workflow 1: High Error Rate + +1. Identify service with high error rate and get error breakdown by operation +2. Sample error traces; analyze patterns: 4xx = input/auth issues, 5xx = bugs/dependency failures +3. 
Check dependencies, recent deployments, and CloudWatch Logs + +``` +fields @timestamp, endpoint, statusCode, errorType +| filter statusCode >= 400 +| stats count(*) as errorCount by endpoint, statusCode +| sort errorCount desc +| limit 20 +``` + +### Workflow 2: Increased Latency + +1. Compare current P95/P99 vs baseline; identify slow operations +2. Analyze slow traces (which spans, downstream services, DB queries) +3. Check for increased load, code changes, dependency issues, resource constraints + +``` +fields @timestamp, endpoint, duration +| filter ispresent(duration) +| stats count(*) as requestCount, + avg(duration) as avgDuration, + pct(duration, 95) as p95, + pct(duration, 99) as p99, + max(duration) as maxDuration + by endpoint +| sort p95 desc +| limit 20 +``` + +### Workflow 3: SLO Breach Investigation + +1. Identify which SLO was breached and when +2. Get metrics during breach period (error rate spike, latency increase, traffic surge) +3. Check correlated events: deployments (CloudTrail), infrastructure changes, dependency failures +4. 
Review traces from breach period, document root cause, update runbooks + +## Alerting Configuration + +**Critical**: ErrorRate > 5% (5 min, 3/5 datapoints, page on-call) | P99 > 2000ms (10 min, 2/2, notify team) | Error budget < 20% (1 hour, notify + feature freeze) + +**Warning**: ErrorRate > 1% (15 min, notify channel) | P95 > 1000ms (15 min, log to monitoring) + +## Best Practices + +- **Instrumentation**: X-Ray SDK for custom instrumentation; environment in service names; consistent naming +- **SLO Management**: Start realistic (99% before 99.99%); align with business needs; review quarterly +- **Trace Sampling**: Adaptive for high-volume; always capture errors; 100% during incidents +- **Metric Collection**: Monitor P50/P95/P99; track 4xx vs 5xx; measure volume trends +- **Baselines**: Establish normal latency baselines; document expected error rates; track seasonal patterns + +## Common Performance Patterns + +| Pattern | Symptom | Solution | +| --------------------- | --------------------------- | -------------------------------------- | +| Cold Start Impact | High P99, large P50-P99 gap | Provisioned concurrency, keep-alive | +| DB Connection Pooling | Latency increases with load | Connection pooling, query optimization | +| Cascading Failures | Multiple services erroring | Circuit breakers, timeouts, bulkheads | +| Cache Invalidation | Periodic latency spikes | Optimize cache strategy, cache warming | +| Traffic Bursts | Error rate up during peaks | Auto-scaling, rate limiting, queues | + +## Quick Reference Commands + +- **Service health**: `audit_services(...)` with `Name: "*"` +- **Slow request**: `search_transaction_spans(...)` with `FILTER ... 
and duration > 5000 | LIMIT 20` +- **Dependencies**: `audit_services(..., auditors="dependency_metric,trace")` +- **SLO monitor**: `audit_slos(...)` then `get_slo(...)` then if breached: add `auditors="all"` diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/performance-traces.md b/plugins/observability-on-aws/skills/observability-on-aws/references/performance-traces.md new file mode 100644 index 00000000..0883d1c2 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/performance-traces.md @@ -0,0 +1,94 @@ +# Application Signals Trace Analysis + +## Purpose + +Transaction search patterns, X-Ray filter expressions, and distributed tracing analysis for Application Signals performance monitoring. + +## Transaction Search Query Patterns + +Use with `search_transaction_spans` (100% sampled, queries `aws/spans` log group): + +**Error analysis:** + +``` +FILTER attributes.aws.local.service = "service-name" + and attributes.http.status_code >= 400 +| STATS count() as error_count by attributes.aws.local.operation +| SORT error_count DESC +| LIMIT 20 +``` + +**Latency analysis:** + +``` +FILTER attributes.aws.local.service = "service-name" +| STATS avg(duration) as avg_latency, pct(duration, 99) as p99_latency + by attributes.aws.local.operation +| SORT p99_latency DESC +| LIMIT 20 +``` + +**Dependency calls:** + +``` +FILTER attributes.aws.local.service = "service-name" +| STATS count() as call_count, avg(duration) as avg_latency + by attributes.aws.remote.service, attributes.aws.remote.operation +| SORT call_count DESC +| LIMIT 20 +``` + +**GenAI token usage:** + +``` +FILTER attributes.aws.local.service = "service-name" + and attributes.aws.remote.operation = "InvokeModel" +| STATS sum(attributes.gen_ai.usage.output_tokens) as total_tokens + by attributes.gen_ai.request.model, bin(1h) +``` + +## X-Ray Filter Expressions + +Use with `query_sampled_traces` (5% sampled): + +``` +service("service-name"){fault = 
true} +service("service-name") AND duration > 5 +annotation[aws.local.operation]="GET /api/orders" +http.status = 500 +service("api"){fault = true} AND annotation[aws.local.operation]="POST /checkout" +``` + +## Pagination (`next_token`) Guidance + +Wildcard patterns process services/SLOs in batches (default: 5 per call). First call returns findings + `next_token` if more results exist. Continue with same parameters + `next_token` until no token is returned. + +## Tool Usage Patterns + +| Pattern | Steps | +| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Health Check | `audit_services(...)` with `Name: "*"` then add `auditors="all"` for issues | +| Latency Investigation | `audit_service_operations(...)` with `MetricType: Latency` then `query_service_metrics(...)` then `search_transaction_spans(...)` with latency query above | +| Dependency Analysis | `audit_services(..., auditors="dependency_metric,trace")` then `query_service_metrics(...)` per dependency | +| SLO Monitoring | `audit_slos(...)` with `SloName: "*"` then `get_slo(...)` then `audit_slos(..., auditors="all")` for breaches | + +## Distributed Tracing Analysis + +**What to look for**: Long spans (disproportionate time), sequential calls (parallelize), repeated operations (cache), external dependency latency, error spans (failed operations/exceptions). + +**Common patterns**: N+1 queries (fix: batch/cache), sequential external calls (fix: parallelize), long cold starts > 1s (fix: provisioned concurrency), downstream timeouts (fix: circuit breaker/timeout tuning). 
+ +## Integration with CloudWatch Logs + +Correlate Application Signals metrics with logs using Logs Insights: + +``` +fields @timestamp, @message, requestId, traceId, duration, level +| filter duration > 1000 +| sort duration desc +| limit 100 +``` + +## Integration with CloudTrail + +Follow CloudTrail data source priority (see `cloudtrail-data-source-selection.md`): Lake event data stores (preferred), CloudWatch Logs integration, or Lookup Events API (fallback). diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/prerequisites.md b/plugins/observability-on-aws/skills/observability-on-aws/references/prerequisites.md new file mode 100644 index 00000000..aa1b600d --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/prerequisites.md @@ -0,0 +1,35 @@ +# Prerequisites and Configuration + +## Requirements + +1. **AWS CLI configured** with credentials (`aws configure` or `~/.aws/credentials`) +2. **Python 3.10+** and `uv` installed +3. **Application Signals enabled** in your AWS account when applicable + +## MCP Server Configuration + +After installing this plugin, update the `env` section for each stdio MCP server with your AWS profile and region: + +```json +"env": { + "AWS_PROFILE": "your-profile-name", + "AWS_REGION": "us-east-1", + "FASTMCP_LOG_LEVEL": "ERROR" +} +``` + +Servers requiring AWS credentials: `awslabs.cloudwatch-mcp-server`, `awslabs.cloudwatch-applicationsignals-mcp-server`, `awslabs.cloudtrail-mcp-server`, `awslabs.billing-cost-management-mcp-server`. + +**Default:** Uses `default` AWS profile and `us-east-1` region. 
+ +## Required IAM Permissions (read-only, least-privilege) + +- **CloudWatch Metrics & Alarms**: `cloudwatch:GetMetricData`, `cloudwatch:GetMetricStatistics`, `cloudwatch:ListMetrics`, `cloudwatch:DescribeAlarms`, `cloudwatch:DescribeAlarmsForMetric`, `cloudwatch:DescribeAlarmHistory`, `cloudwatch:DescribeAnomalyDetectors` +- **CloudWatch Logs**: `logs:DescribeLogGroups`, `logs:DescribeLogStreams`, `logs:GetLogEvents`, `logs:FilterLogEvents`, `logs:StartQuery`, `logs:StopQuery`, `logs:GetQueryResults`, `logs:DescribeQueries` +- **X-Ray**: `xray:BatchGetTraces`, `xray:GetTraceSummaries`, `xray:GetTraceGraph`, `xray:GetServiceGraph`, `xray:GetTimeSeriesServiceStatistics` +- **CloudTrail (events + Lake queries)**: `cloudtrail:LookupEvents`, `cloudtrail:DescribeTrails`, `cloudtrail:GetTrail`, `cloudtrail:ListTrails`, `cloudtrail:GetEventSelectors`, `cloudtrail:ListEventDataStores`, `cloudtrail:GetEventDataStore`, `cloudtrail:StartQuery`, `cloudtrail:DescribeQuery`, `cloudtrail:GetQueryResults`, `cloudtrail:ListQueries`, `cloudtrail:CancelQuery` +- **Application Signals**: `application-signals:GetService`, `application-signals:ListServices`, `application-signals:ListServiceOperations`, `application-signals:GetServiceLevelObjective`, `application-signals:ListServiceLevelObjectives`, `application-signals:BatchGetServiceLevelObjectiveBudgetReport` +- **Billing & Cost Management (read-only)**: `ce:GetCostAndUsage`, `ce:GetCostAndUsageWithResources`, `ce:GetCostForecast`, `ce:GetCostCategories`, `ce:GetDimensionValues`, `ce:GetTags`, `ce:GetAnomalies`, `ce:GetAnomalyMonitors`, `ce:GetReservationUtilization`, `ce:GetSavingsPlansUtilization`, `ce:DescribeCostCategoryDefinition`, `cost-optimization-hub:ListRecommendations`, `cost-optimization-hub:GetRecommendation`, `compute-optimizer:GetEnrollmentStatus`, `compute-optimizer:GetRecommendationSummaries`, `compute-optimizer:GetEC2InstanceRecommendations`, `compute-optimizer:GetAutoScalingGroupRecommendations`, 
`compute-optimizer:GetLambdaFunctionRecommendations`, `budgets:ViewBudget`, `pricing:GetProducts`, `pricing:DescribeServices`, `freetier:GetFreeTierUsage` +- `synthetics:GetCanary`, `synthetics:GetCanaryRuns` for canary analysis +- `s3:GetObject`, `s3:ListBucket` for canary artifacts +- `iam:GetRole`, `iam:ListAttachedRolePolicies`, `iam:GetPolicy`, `iam:GetPolicyVersion` for enablement guides diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/security-auditing.md b/plugins/observability-on-aws/skills/observability-on-aws/references/security-auditing.md new file mode 100644 index 00000000..3286a8f9 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/security-auditing.md @@ -0,0 +1,97 @@ +# CloudTrail Security Auditing + +## Purpose + +Guidance for accessing and analyzing CloudTrail audit data for security auditing, compliance monitoring, and governance analysis. + +## Prerequisites and Data Source Selection + +Follow the priority order defined in `cloudtrail-data-source-selection.md`: + +1. **CloudTrail Lake** (preferred): Use `list_event_data_stores` then `lake_query` for SQL-based analysis. Configurable retention (up to 2,557 days / ~7 years). +2. **CloudWatch Logs**: Use `describe_log_groups` to find groups matching "cloudtrail". Use `execute_log_insights_query` for real-time monitoring. +3. **Lookup Events API** (fallback): Use `lookup_events` from CloudTrail MCP server. Limited to last 90 days, 50 results per call. + +## When to Load This Reference + +Load when the user needs to investigate security incidents, track API activity, perform compliance audits, monitor IAM changes, or detect unauthorized access. For investigation queries, see `security-investigations.md`. For monitoring and alerting, see `security-monitoring.md`. + +## CloudTrail Overview + +CloudTrail logs all API calls in your AWS account for governance, compliance, and audit. 
Supports real-time analysis, CloudWatch Logs Insights queries, cross-service correlation, multi-region/multi-account, and configurable retention.

## Core Concepts

### Event Types

- **Management Events**: Control plane operations (CreateBucket, RunInstances, CreateUser)
- **Data Events**: Data plane operations (GetObject, PutObject, Invoke)
- **Insights Events**: ML-detected unusual API activity (rate and error rate anomalies)

### Event Structure

```json
{
  "eventTime": "2024-12-08T10:30:00Z",
  "eventName": "CreateBucket",
  "eventSource": "s3.amazonaws.com",
  "userIdentity": { "type": "IAMUser", "userName": "alice" },
  "sourceIPAddress": "203.0.113.0",
  "userAgent": "aws-cli/2.0.0",
  "requestParameters": { "bucketName": "my-new-bucket" },
  "errorCode": null
}
```

## Querying CloudTrail Data

### CloudTrail Lake (Priority 1)

```sql
SELECT eventTime, eventName, userIdentity.userName, sourceIPAddress
FROM <event-data-store-id>
WHERE eventName IN ('DeleteBucket', 'TerminateInstances')
  AND eventTime > timestamp '2024-01-01 00:00:00'
ORDER BY eventTime DESC
LIMIT 50
```

### CloudWatch Logs (Priority 2)

```
fields eventTime, eventName, userIdentity.userName, sourceIPAddress
| filter eventName = "DeleteBucket" or eventName = "TerminateInstances"
| sort eventTime desc
| limit 50
```

### Lookup Events API (Priority 3)

```
lookup_events(
  LookupAttributes=[{ 'AttributeKey': 'EventName', 'AttributeValue': 'DeleteBucket' }],
  StartTime='2024-10-01T00:00:00Z', # Must be within 90 days
  EndTime='2024-12-31T23:59:59Z',
  MaxResults=50
)
```

## Quick Reference

### Common Event Names

- **IAM**: CreateUser, DeleteUser, AttachUserPolicy, CreateAccessKey
- **EC2**: RunInstances, TerminateInstances, AuthorizeSecurityGroupIngress
- **S3**: CreateBucket, DeleteBucket, PutBucketPolicy
- **RDS**: CreateDBInstance, DeleteDBInstance, ModifyDBInstance
- **Lambda**: CreateFunction, DeleteFunction, UpdateFunctionCode

### User Identity
Types + +- **IAMUser**: Standard IAM user | **AssumedRole**: Role via STS | **Root**: Root account +- **FederatedUser**: Federated identity | **AWSService**: AWS service acting on your behalf + +### Error Codes + +- **AccessDenied**: Permission denied | **UnauthorizedOperation**: Not authorized +- **InvalidParameter**: Invalid request parameter | **ResourceNotFound**: Resource doesn't exist diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/security-investigations.md b/plugins/observability-on-aws/skills/observability-on-aws/references/security-investigations.md new file mode 100644 index 00000000..8987e8a5 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/security-investigations.md @@ -0,0 +1,99 @@ +# Security Investigations + +## Purpose + +Investigation workflows for security incidents, compliance audits, and resource change tracking using CloudTrail data via CloudWatch Logs Insights. + +## Security Incident Investigation + +``` +# Identify failed access attempts +fields eventTime, eventName, userIdentity.userName, sourceIPAddress, errorCode, errorMessage +| filter errorCode = "AccessDenied" or errorCode = "UnauthorizedOperation" +| sort eventTime desc +| limit 100 + +# Trace specific user activity +fields eventTime, eventName, requestParameters, responseElements +| filter userIdentity.userName = "suspect-user" +| sort eventTime desc +| limit 100 + +# Check IAM changes before incident +fields eventTime, eventName, userIdentity.userName, requestParameters +| filter eventSource = "iam.amazonaws.com" +| filter eventName like /Create|Delete|Update|Attach|Detach/ +| sort eventTime desc +| limit 50 +``` + +## Compliance Audit + +``` +# List all IAM changes in period +fields eventTime, eventName, userIdentity.userName, requestParameters +| filter eventSource = "iam.amazonaws.com" +| filter eventName like /Create|Delete|Update|Attach|Detach/ +| sort eventTime desc +| limit 100 + +# Track privileged 
actions +fields eventTime, eventName, userIdentity.userName, requestParameters +| filter eventName = "AssumeRole" or eventName = "GetFederationToken" +| sort eventTime desc +| limit 100 +``` + +## Resource Change Tracking + +``` +# Find specific resource changes (replace with actual resource ARN) +fields eventTime, eventName, userIdentity.userName, requestParameters +| filter requestParameters like /arn:aws:s3:::my-bucket/ +| sort eventTime desc +| limit 50 + +# Track configuration changes for a service +fields eventTime, eventName, userIdentity.userName, requestParameters +| filter eventSource = "ec2.amazonaws.com" +| filter eventName like /Update|Modify|Put/ +| sort eventTime desc +| limit 100 +``` + +## Investigation Workflows + +``` +# Privilege escalation check +fields eventTime, eventName, requestParameters.policyDocument as policy +| filter userIdentity.userName = "suspect-user" +| filter eventName in ["CreatePolicy", "AttachUserPolicy", "PutUserPolicy", "AttachRolePolicy", "PutRolePolicy"] +| sort eventTime asc +| limit 100 + +# Summarize resources accessed by suspect user +fields eventName, eventSource +| filter userIdentity.userName = "suspect-user" +| stats count() as accessCount by eventName, eventSource +| sort accessCount desc +| limit 100 + +# Identify who deleted a resource (replace event name and resource) +fields eventTime, userIdentity.userName, userIdentity.arn, sourceIPAddress, userAgent +| filter eventName = "DeleteBucket" +| filter requestParameters.bucketName = "my-critical-bucket" +| sort eventTime desc +| limit 10 +``` + +## Compliance Audit Tasks + +``` +# Users created, access keys, and KMS key operations in audit period +fields eventTime, eventName, userIdentity.userName, requestParameters +| filter eventName in ["CreateUser", "CreateAccessKey", "CreateKey", "ScheduleKeyDeletion", "DisableKey"] +| sort eventTime asc +| limit 500 +``` + +To correlate CloudTrail with application logs: query for security events, extract timestamps, then 
query application logs for the same period (Logs Insights does not support JOINs across log groups). diff --git a/plugins/observability-on-aws/skills/observability-on-aws/references/security-monitoring.md b/plugins/observability-on-aws/skills/observability-on-aws/references/security-monitoring.md new file mode 100644 index 00000000..262eb825 --- /dev/null +++ b/plugins/observability-on-aws/skills/observability-on-aws/references/security-monitoring.md @@ -0,0 +1,91 @@ +# Security Monitoring + +## Purpose + +Security monitoring use cases and alerting patterns using CloudTrail data via CloudWatch Logs Insights. + +## Monitoring Use Cases + +### Unauthorized Access Detection + +``` +fields eventTime, eventName, userIdentity.userName, sourceIPAddress, errorCode +| filter (eventName = "ConsoleLogin" or eventName = "AssumeRole") +| filter errorCode like /AccessDenied|AuthFailure|UnauthorizedOperation/ +| stats count() as failedAttempts by userIdentity.userName, sourceIPAddress +| sort failedAttempts desc +| limit 50 +``` + +### IAM Changes Audit + +``` +fields eventTime, eventName, userIdentity.userName as actor, requestParameters.userName as targetUser, sourceIPAddress +| filter eventSource = "iam.amazonaws.com" +| filter eventName in ["CreateUser", "DeleteUser", "CreateRole", "DeleteRole", "PutUserPolicy", "PutRolePolicy", "AttachUserPolicy", "AttachRolePolicy", "CreateAccessKey", "DeactivateMFADevice"] +| sort eventTime desc +| limit 100 +``` + +### Resource Deletion Tracking + +``` +fields eventTime, eventName, userIdentity.userName, requestParameters, sourceIPAddress +| filter eventName like /Delete|Terminate/ +| sort eventTime desc +| limit 100 +``` + +### Security Group Changes + +``` +fields eventTime, eventName, userIdentity.userName as user, requestParameters.groupId as sgId, sourceIPAddress +| filter eventSource = "ec2.amazonaws.com" +| filter eventName in ["AuthorizeSecurityGroupIngress", "AuthorizeSecurityGroupEgress", "RevokeSecurityGroupIngress", 
"RevokeSecurityGroupEgress"]
| sort eventTime desc
| limit 100
```

### Root Account Activity

```
fields eventTime, eventName, eventSource, sourceIPAddress, userIdentity.sessionContext.attributes.mfaAuthenticated as mfaUsed
| filter userIdentity.type = "Root"
| sort eventTime desc
| limit 100
```

### Cross-Account Access

```
fields eventTime, eventName, userIdentity.accountId, recipientAccountId, sourceIPAddress
| filter eventName = "AssumeRole"
| filter userIdentity.accountId != recipientAccountId
| sort eventTime desc
| limit 100
```

## Critical Alerting Patterns

Use the Root Account and Security Group queries above as CloudWatch Alarms. Additional critical alerts:

```
# IAM policy modifications → review changes, alert if suspicious
fields eventTime, eventName, userIdentity.userName as user, requestParameters
| filter eventName in ["PutUserPolicy", "PutRolePolicy", "AttachUserPolicy", "AttachRolePolicy"]
| sort eventTime desc
| limit 50

# KMS key deletion scheduled → confirm key is no longer needed
fields eventTime, userIdentity.userName as user, requestParameters.keyId as keyId
| filter eventName = "ScheduleKeyDeletion"
| sort eventTime desc
| limit 50
```

## Best Practices

- Enable CloudTrail in all regions/accounts with log file validation and CloudWatch Logs integration
- Set up CloudWatch Alarms and metric filters for critical events; automate threat response
- Configure retention policies; archive to S3/Glacier with Object Lock for immutability
- Restrict CloudTrail log group access; enable encryption; integrate with Application Signals, Security Hub, and GuardDuty