Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"os"
"os/exec"
Expand All @@ -13,23 +14,27 @@ import (
)

func main() {
response := processRepository()
noLarge := flag.Bool("no-large", false, "Skip files larger than 100MB to avoid OOM on large repos")
flag.Parse()

response := processRepository(*noLarge)
outputJSON(response)

// Always exit with code 0 - status details are in JSON response
}

// processRepository handles the main logic and returns a StandardResponse
func processRepository() StandardResponse {
func processRepository(noLarge bool) StandardResponse {
ctx := context.Background()

// Get target path from command line argument
// Get target path from remaining non-flag arguments
args := flag.Args()
var targetPath string
if len(os.Args) > 1 {
targetPath = os.Args[1]
if len(args) > 0 {
targetPath = args[0]
} else {
errorCode := ErrorCodeInvalidArguments
errorMessage := fmt.Sprintf("Usage: %s <target-path>", os.Args[0])
errorMessage := fmt.Sprintf("Usage: %s [--no-large] <target-path>", os.Args[0])
return StandardResponse{
Status: StatusFailure,
ErrorCode: &errorCode,
Expand All @@ -51,10 +56,10 @@ func processRepository() StandardResponse {
// Process single repository (the target path argument)
repoDir := config.TargetPath

insightsDb, err := NewInsightsDB(ctx, config.InsightsDatabase)
if err != nil {
insightsDb, dbErr := NewInsightsDB(ctx, config.InsightsDatabase)
if dbErr != nil {
errorCode := ErrorCodeDatabaseConnection
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", err)
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", dbErr)
return StandardResponse{
Status: StatusFailure,
ErrorCode: &errorCode,
Expand All @@ -76,7 +81,7 @@ func processRepository() StandardResponse {
}

// Process the repository with SCC
report, err := getSCCReport(config.SCCPath, repoDir)
report, err := getSCCReport(config.SCCPath, repoDir, noLarge)
if err != nil {
errorCode := getErrorCodeFromSCCError(err)
errorMessage := fmt.Sprintf("Error processing repository '%s': %v", repoDir, err)
Expand Down Expand Up @@ -120,10 +125,10 @@ func processRepository() StandardResponse {


// getSCCReport analyzes a directory with scc and returns a report containing the estimated cost and language statistics.
func getSCCReport(sccPath, dirPath string) (SCCReport, error) {
cost, err := getCost(sccPath, dirPath)
func getSCCReport(sccPath, dirPath string, noLarge bool) (SCCReport, error) {
cost, err := getCost(sccPath, dirPath, noLarge)
if err != nil {
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v\"", err)
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v", dirPath, err)
}

// Skip saving to database if cost is 0 - do we want to do this?
Expand All @@ -133,7 +138,7 @@ func getSCCReport(sccPath, dirPath string) (SCCReport, error) {

projectPath := filepath.Base(dirPath)

langStats, err := getLanguageStats(sccPath, dirPath)
langStats, err := getLanguageStats(sccPath, dirPath, noLarge)
if err != nil {
return SCCReport{}, fmt.Errorf("error getting language stats for '%s': %v", dirPath, err)
}
Expand Down Expand Up @@ -177,8 +182,8 @@ func getGitRepositoryURL(dirPath string) (string, error) {
}

// getCost runs the scc command and parses the output to get the estimated cost.
func getCost(sccPathPath, repoPath string) (float64, error) {
output, err := runSCC(sccPathPath, "--format=short", repoPath)
func getCost(sccPathPath, repoPath string, noLarge bool) (float64, error) {
output, err := runSCC(sccPathPath, noLarge, "--format=short", repoPath)
if err != nil {
return 0, fmt.Errorf("failed to run scc command: %w", err)
}
Expand All @@ -192,8 +197,8 @@ func getCost(sccPathPath, repoPath string) (float64, error) {
}

// getLanguageStats runs the scc command and parses the output to get language statistics.
func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
output, err := runSCC(sccPathPath, "--format=json", repoPath)
func getLanguageStats(sccPathPath, repoPath string, noLarge bool) ([]LanguageStats, error) {
output, err := runSCC(sccPathPath, noLarge, "--format=json", repoPath)
if err != nil {
return nil, fmt.Errorf("failed to run scc command: %w", err)
}
Expand All @@ -207,8 +212,14 @@ func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
}

// runSCC executes the scc command with the given arguments and returns the output.
func runSCC(sccPathPath string, args ...string) (string, error) {
cmd := exec.Command(sccPathPath, args...)
// When noLarge is true, files larger than 100MB are skipped to avoid OOM on large repos.
func runSCC(sccPathPath string, noLarge bool, args ...string) (string, error) {
var cmdArgs []string
if noLarge {
cmdArgs = append(cmdArgs, "--no-large", "--large-byte-count", "100000000")
}
Comment on lines +218 to +220
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

runSCC hard-codes the large-file threshold as the string literal "100000000". To reduce magic numbers and keep the threshold consistent across Go/Python (and with the flag help text), consider extracting this into a named constant (and potentially expressing it as 100*1000*1000 or 100*1024*1024 explicitly).

Copilot uses AI. Check for mistakes.
cmdArgs = append(cmdArgs, args...)
cmd := exec.Command(sccPathPath, cmdArgs...)
output, err := cmd.Output()
if err != nil {
if exitErr, ok := err.(*exec.ExitError); ok {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import subprocess
import time
from decimal import Decimal

Expand All @@ -8,6 +9,21 @@
from crowdgit.services.base.base_service import BaseService
from crowdgit.services.utils import run_shell_command

_LARGE_REPO_THRESHOLD_BYTES = 10 * 1024 * 1024 * 1024 # 10 GB


def _get_repo_size_bytes(repo_path: str) -> int:
"""Return total disk usage of repo_path in bytes using du -sb."""
try:
result = subprocess.run(
["du", "-sb", repo_path], capture_output=True, text=True, timeout=120
)
Comment on lines +18 to +20
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_get_repo_size_bytes() uses subprocess.run(...) inside an async service; this is a blocking call and can stall the event loop (and other concurrent repo processing). Consider using the existing async run_shell_command() helper (or asyncio.to_thread) to run du without blocking.

Copilot uses AI. Check for mistakes.
if result.returncode == 0:
return int(result.stdout.split()[0])
except Exception:
pass
return 0
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Blocking synchronous subprocess call in async method

Medium Severity

_get_repo_size_bytes uses synchronous subprocess.run to execute du -sb, which blocks the asyncio event loop when called from the async def run method. Running du -sb on a large repository can take significant time (up to the 120-second timeout). The async run_shell_command utility is already imported in this file and used elsewhere in the codebase for the same purpose (e.g., _get_repo_size_mb in clone_service.py).

Additional Locations (1)
Fix in Cursor Fix in Web

Comment on lines +23 to +25
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_get_repo_size_bytes() silently swallows all exceptions and returns 0. If du fails (e.g., not present, permission issues, timeout), the service will skip enabling --no-large without any signal. Please log a warning (and consider catching narrower exceptions / including stderr) so operators can tell when the size check is not working.

Copilot uses AI. Check for mistakes.


class SoftwareValueService(BaseService):
"""Service for calculating software value metrics"""
Expand All @@ -20,16 +36,30 @@ def __init__(self):
async def run(self, repo_id: str, repo_path: str) -> None:
"""
Triggers software value binary for given repo.
Results are saved into insights database directly
Results are saved into insights database directly.
For repos larger than 10 GB, scc is run with minimum parallelism (1 worker)
to avoid OOM; results are identical.
Comment on lines +40 to +41
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring says that for repos >10GB “scc is run with minimum parallelism (1 worker)”, but the actual behavior added below is enabling the --no-large flag (skip files >100MB). Please update the docstring to reflect the real behavior (or implement the parallelism change if that’s what was intended).

Suggested change
For repos larger than 10 GB, scc is run with minimum parallelism (1 worker)
to avoid OOM; results are identical.
For repos larger than 10 GB, scc is run with the --no-large flag (skipping files >100MB)
to reduce memory usage and avoid OOM errors.

Copilot uses AI. Check for mistakes.
"""
start_time = time.time()
execution_status = ExecutionStatus.SUCCESS
error_code = None
error_message = None

try:
cmd = [self.software_value_executable]

repo_size = _get_repo_size_bytes(repo_path)
if repo_size >= _LARGE_REPO_THRESHOLD_BYTES:
self.logger.info(
f"Repo size {repo_size / (1024**3):.1f} GB exceeds threshold — "
"running scc with no-large (skipping files >100MB)"
)
cmd += ["--no-large"]
Comment on lines +51 to +57
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New behavior adds conditional --no-large based on repo size, but there are no tests asserting the command built for large vs small repos. Since this is reliability-critical logic, please add unit coverage (e.g., mock _get_repo_size_bytes / run_shell_command and assert --no-large is included only when expected).

Copilot uses AI. Check for mistakes.

cmd.append(repo_path)

self.logger.info("Running software value...")
output = await run_shell_command([self.software_value_executable, repo_path])
output = await run_shell_command(cmd)
self.logger.info(f"Software value output: {output}")

# Parse JSON output and extract fields from StandardResponse structure
Expand Down
Loading