fix: update the alert correlation logic and build the historical context from counts

yllada · yllada · commit da77066345d9 · 2025-05-04T00:04:13.000-04:00
diff --git a/soc-ai/configurations/const.go b/soc-ai/configurations/const.go
@@ -61,7 +61,7 @@ var (
 	}
 	GPT_INSTRUCTION     = "You are an expert security engineer. Perform a deep analysis of an alert created by a SIEM and the logs related to it. Determine if the alert could be an actual potential threat or not and explain why. Provide a description that shows a deep understanding of the alert based on a deep analysis of its logs and estimate the risk to the systems affected. Classify the alert in the following manner: if the alert information is sufficient to determine that the security, availability, confidentiality, or integrity of the systems has being compromised, then classify it as \"possible incident\". If the alert does not pose a security risk to the organization or has no security relevance, classify it as \"possible false positive\". If the alert does not pose an imminent risk to the systems, requires no urgent action from an administrator, or requires not urgent review by an administrator, it should be classified as a \"standard alert\". You will also provide context-specific instructions for remediation, mitigation, or further investigation, related to the alert and logs analyzed. Your answer should be provided using the following JSON format and the total number of characters in your answer must not exceed 1500 words. Your entire answer must be inside this json format. {\"activity_id\":\"<activity_id>\",\"classification\":\"<classification>\",\"reasoning\":[\"<deep_reasoning>\"],\"nextSteps\":[{\"step\":1,\"action\":\"<action_1>\",\"details\":\"<action_1_details>\"},{\"step\":2,\"action\":\"<action_2>\",\"details\":\"<action_2_details>\"},{\"step\":3,\"action\":\"<action_3>\"]}Ensure that your entire answer adheres to the provided JSON format. The response should be valid JSON syntax and schema."
 	GPT_FALSE_POSITIVE  = "This alert is categorized as a potential false positive due to two key factors. Firstly, it originates from an automated system, which may occasionally produce alerts without direct human validation. Additionally, the absence of any correlated logs further raises suspicion, as a genuine incident typically leaves a trail of relevant log entries. Hence, the combination of its system-generated nature and the lack of associated logs suggests a likelihood of being a false positive rather than a genuine security incident."
-	CORRELATION_CONTEXT = "\n\nAlert Context: The current alert has historical correlation with previous alerts:\n%s"
+	CORRELATION_CONTEXT = "\n\nThe current alert has historical correlation with previous alerts:\n%s"
 )
 
 func GetInternalKey() string {
diff --git a/soc-ai/elastic/alerts.go b/soc-ai/elastic/alerts.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"net/http"
+	"sort"
 	"strings"
 
 	"github.com/utmstack/soc-ai/configurations"
@@ -67,136 +68,195 @@ func ChangeAlertStatus(id string, status int, observations string) error {
 	return nil
 }
 
+type AlertCounts struct {
+	Incidents     int
+	FalsePositive int
+	Standard      int
+	Unclassified  int
+}
+
+type MatchTypeCounts struct {
+	SourceIP        AlertCounts
+	DestinationIP   AlertCounts
+	SourceUser      AlertCounts
+	DestinationUser AlertCounts
+}
+
 type AlertCorrelation struct {
-	CurrentAlert    schema.Alert
-	RelatedAlerts   []schema.Alert
-	Classifications []string
+	CurrentAlert  schema.Alert
+	RelatedAlerts []schema.Alert
+	Counts        MatchTypeCounts
 }
 
-func GetRelatedAlerts() ([]schema.Alert, error) {
-	result, err := ElasticSearch(configurations.ALERT_INDEX_PATTERN, "", "")
+func GetRelatedAlerts(alertName string) ([]schema.Alert, error) {
+	result, err := ElasticSearch(configurations.ALERT_INDEX_PATTERN, "name", alertName)
 	if err != nil {
 		return nil, fmt.Errorf("error getting historical alerts: %v", err)
 	}
 
 	var alerts []schema.Alert
-	err = json.Unmarshal(result, &alerts)
-	if err != nil {
+	if err := json.Unmarshal(result, &alerts); err != nil {
 		return nil, fmt.Errorf("error unmarshalling alerts: %v", err)
 	}
 
 	return alerts, nil
 }
 
-func FindRelatedAlerts(currentAlert schema.Alert) (*AlertCorrelation, error) {
-	correlation := &AlertCorrelation{
-		CurrentAlert:    currentAlert,
-		RelatedAlerts:   make([]schema.Alert, 0),
-		Classifications: make([]string, 0),
-	}
-
-	historicalResponses, err := GetRelatedAlerts()
+func FindRelatedAlerts(current schema.Alert) (*AlertCorrelation, error) {
+	alerts, err := GetRelatedAlerts(current.Name)
 	if err != nil {
 		return nil, err
 	}
 
-	for _, hist := range historicalResponses {
-		if isAlertRelated(currentAlert, hist) {
-			correlation.RelatedAlerts = append(correlation.RelatedAlerts, hist)
-
-			classification := "This alert has not been classified"
-			if len(hist.Tags) > 0 {
-				classification = strings.Join(hist.Tags, ", ")
+	corr := &AlertCorrelation{CurrentAlert: current}
+	for _, hist := range alerts {
+		if hist.ID == current.ID {
+			continue
+		}
+		if related, matches := isAlertRelated(current, hist); related {
+			classif := getAlertClassification(hist)
+			for _, m := range matches {
+				incrementCount(&corr.Counts, m, classif)
 			}
-			correlation.Classifications = append(correlation.Classifications, classification)
+			corr.RelatedAlerts = append(corr.RelatedAlerts, hist)
 		}
 	}
-
-	utils.Logger.Info("Completed related alerts search. Found %d related alerts for ID: %s",
-		len(correlation.RelatedAlerts), currentAlert.ID)
-
-	return correlation, nil
+	return corr, nil
 }
 
-func isAlertRelated(current, historical schema.Alert) bool {
-	if current.ID == historical.ID {
-		return false
+func isAlertRelated(current, historical schema.Alert) (bool, []string) {
+	if current.ID == historical.ID || current.Name != historical.Name {
+		return false, nil
 	}
 
-	if current.Destination.IP != "" && current.Destination.IP == historical.Destination.IP {
-		return true
+	var matches []string
+
+	if current.Source.IP != "" && current.Source.IP == historical.Source.IP {
+		matches = append(matches, "SourceIP")
 	}
-	if current.Destination.Port != 0 && current.Destination.Port == historical.Destination.Port {
-		return true
+	if current.Destination.IP != "" && current.Destination.IP == historical.Destination.IP {
+		matches = append(matches, "DestinationIP")
 	}
-	if current.Destination.Host != "" && current.Destination.Host == historical.Destination.Host {
-		return true
+	if current.Source.User != "" && current.Source.User == historical.Source.User {
+		matches = append(matches, "SourceUser")
 	}
 	if current.Destination.User != "" && current.Destination.User == historical.Destination.User {
-		return true
+		matches = append(matches, "DestinationUser")
 	}
 
-	if current.Source.IP != "" && current.Source.IP == historical.Source.IP {
-		return true
+	sort.Strings(matches)
+	return len(matches) > 0, matches
+}
+
+func getAlertClassification(alert schema.Alert) string {
+	if len(alert.Tags) == 0 {
+		return "Unclassified alert"
 	}
-	if current.Source.Port != 0 && current.Source.Port == historical.Source.Port {
-		return true
+	switch strings.ToLower(alert.Tags[0]) {
+	case "possible incident":
+		return "Possible incident"
+	case "false positive":
+		return "False positive"
+	case "standard alert":
+		return "Standard alert"
+	default:
+		return "Unclassified alert"
 	}
-	if current.Source.Host != "" && current.Source.Host == historical.Source.Host {
-		return true
+}
+
+func incrementCount(cnts *MatchTypeCounts, matchType, classif string) {
+	var ac *AlertCounts // NOTE(review): stays nil if matchType is none of the four cases below; the second switch would then dereference nil
+
+	switch matchType {
+	case "SourceIP":
+		ac = &cnts.SourceIP
+	case "DestinationIP":
+		ac = &cnts.DestinationIP
+	case "SourceUser":
+		ac = &cnts.SourceUser
+	case "DestinationUser":
+		ac = &cnts.DestinationUser
 	}
-	if current.Source.User != "" && current.Source.User == historical.Source.User {
-		return true
+	switch classif {
+	case "Possible incident":
+		ac.Incidents++
+	case "False positive":
+		ac.FalsePositive++
+	case "Standard alert": // spelling must equal the label returned by getAlertClassification, otherwise these fall into Unclassified
+		ac.Standard++
+	default:
+		ac.Unclassified++
 	}
-
-	return false
 }
 
-func BuildCorrelationContext(correlation *AlertCorrelation) string {
-	var context strings.Builder
-
-	context.WriteString("\nHistorical Context:\n")
-	context.WriteString(fmt.Sprintf("Found %d related alerts with similar characteristics:\n", len(correlation.RelatedAlerts)))
-
-	for i, alert := range correlation.RelatedAlerts {
-		context.WriteString(fmt.Sprintf("\nRelated Alert %d:\n", i+1))
-		context.WriteString(fmt.Sprintf("- Name: %s\n", alert.Name))
-		context.WriteString(fmt.Sprintf("- Severity: %s\n", alert.SeverityLabel))
-		context.WriteString(fmt.Sprintf("- Category: %s\n", alert.Category))
-
-		classification := "This alert has not been classified"
-		if i < len(correlation.Classifications) {
-			classification = correlation.Classifications[i]
-		}
-		context.WriteString(fmt.Sprintf("- Classification: %s\n", classification))
-
-		context.WriteString(fmt.Sprintf("- Time: %s\n", alert.Timestamp))
-
-		if alert.Source.IP != "" {
-			context.WriteString(fmt.Sprintf("- Source IP: %s\n", alert.Source.IP))
-		}
-		if alert.Destination.IP != "" {
-			context.WriteString(fmt.Sprintf("- Destination IP: %s\n", alert.Destination.IP))
-		}
-		if alert.Source.Host != "" {
-			context.WriteString(fmt.Sprintf("- Source Host: %s\n", alert.Source.Host))
-		}
-		if alert.Destination.Host != "" {
-			context.WriteString(fmt.Sprintf("- Destination Host: %s\n", alert.Destination.Host))
-		}
-		if alert.Source.User != "" {
-			context.WriteString(fmt.Sprintf("- Source User: %s\n", alert.Source.User))
+func BuildCorrelationContext(corr *AlertCorrelation) string {
+	if corr == nil || len(corr.RelatedAlerts) == 0 {
+		return "No related alerts exist"
+	}
+	// Group alerts by matches and classifications
+	// Example: "SourceIP+DestinationIP" -> { "Possible incident": 2, "False positive": 1 }
+	groups := make(map[string]map[string]int)
+	for _, alert := range corr.RelatedAlerts {
+		if rel, mts := isAlertRelated(corr.CurrentAlert, alert); rel {
+			key := strings.Join(mts, "+")
+			if _, ok := groups[key]; !ok {
+				groups[key] = make(map[string]int)
+			}
+			classif := getAlertClassification(alert)
+			groups[key][classif]++
 		}
-		if alert.Destination.User != "" {
-			context.WriteString(fmt.Sprintf("- Destination User: %s\n", alert.Destination.User))
+	}
+	// Ordered summary
+	var sb strings.Builder
+	total := len(corr.RelatedAlerts)
+	sb.WriteString("\nHistorical Context: ")
+	sb.WriteString(fmt.Sprintf("In the past, there are %d alerts with the same name", total))
+
+	// Ordered keys
+	keys := make([]string, 0, len(groups))
+	for k := range groups {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+
+	for _, k := range keys {
+		sub := groups[k]
+		// Count total for this group
+		n := 0
+		for _, v := range sub {
+			n += v
 		}
-		if alert.Source.Port != 0 {
-			context.WriteString(fmt.Sprintf("- Source Port: %d\n", alert.Source.Port))
+		sb.WriteString(fmt.Sprintf("\n- %d match the same %s", n, translateMatchTypes(strings.Split(k, "+"))))
+		if n > 0 {
+			sb.WriteString(" and of these " + formatClassifications(sub))
 		}
-		if alert.Destination.Port != 0 {
-			context.WriteString(fmt.Sprintf("- Destination Port: %d\n", alert.Destination.Port))
+	}
+	return sb.String()
+}
+
+var matchTypeNames = map[string]string{
+	"SourceIP":        "Source IP",
+	"DestinationIP":   "Destination IP",
+	"SourceUser":      "Source User",
+	"DestinationUser": "Destination User",
+}
+
+func translateMatchTypes(types []string) string {
+	sort.Strings(types)
+	var out []string
+	for _, t := range types {
+		if name, ok := matchTypeNames[t]; ok {
+			out = append(out, name)
 		}
 	}
+	return strings.Join(out, " and ")
+}
 
-	return context.String()
+func formatClassifications(m map[string]int) string {
+	parts := make([]string, 0, len(m))
+	for classif, cnt := range m {
+		parts = append(parts, fmt.Sprintf("%d were classified as %s", cnt, classif))
+	}
+	sort.Strings(parts)
+	return strings.Join(parts, ", ")
 }

Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,7 @@ var (`
`61`	`61`	`}`
`62`	`62`	GPT_INSTRUCTION = "You are an expert security engineer. Perform a deep analysis of an alert created by a SIEM and the logs related to it. Determine if the alert could be an actual potential threat or not and explain why. Provide a description that shows a deep understanding of the alert based on a deep analysis of its logs and estimate the risk to the systems affected. Classify the alert in the following manner: if the alert information is sufficient to determine that the security, availability, confidentiality, or integrity of the systems has being compromised, then classify it as \"possible incident\". If the alert does not pose a security risk to the organization or has no security relevance, classify it as \"possible false positive\". If the alert does not pose an imminent risk to the systems, requires no urgent action from an administrator, or requires not urgent review by an administrator, it should be classified as a \"standard alert\". You will also provide context-specific instructions for remediation, mitigation, or further investigation, related to the alert and logs analyzed. Your answer should be provided using the following JSON format and the total number of characters in your answer must not exceed 1500 words. Your entire answer must be inside this json format. {\"activity_id\":\"<activity_id>\",\"classification\":\"<classification>\",\"reasoning\":[\"<deep_reasoning>\"],\"nextSteps\":[{\"step\":1,\"action\":\"<action_1>\",\"details\":\"<action_1_details>\"},{\"step\":2,\"action\":\"<action_2>\",\"details\":\"<action_2_details>\"},{\"step\":3,\"action\":\"<action_3>\"]}Ensure that your entire answer adheres to the provided JSON format. The response should be valid JSON syntax and schema."
`63`	`63`	GPT_FALSE_POSITIVE = "This alert is categorized as a potential false positive due to two key factors. Firstly, it originates from an automated system, which may occasionally produce alerts without direct human validation. Additionally, the absence of any correlated logs further raises suspicion, as a genuine incident typically leaves a trail of relevant log entries. Hence, the combination of its system-generated nature and the lack of associated logs suggests a likelihood of being a false positive rather than a genuine security incident."
`64`		`- CORRELATION_CONTEXT = "\n\nAlert Context: The current alert has historical correlation with previous alerts:\n%s"`
	`64`	`+ CORRELATION_CONTEXT = "\n\nThe current alert has historical correlation with previous alerts:\n%s"`
`65`	`65`	`)`
`66`	`66`
`67`	`67`	`func GetInternalKey() string {`