Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"os"
"os/exec"
Expand All @@ -13,23 +14,27 @@ import (
)

func main() {
response := processRepository()
noLarge := flag.Bool("no-large", false, "Skip files larger than 100MB to avoid OOM on large repos")
flag.Parse()

response := processRepository(*noLarge)
outputJSON(response)

// Always exit with code 0 - status details are in JSON response
}

// processRepository handles the main logic and returns a StandardResponse
func processRepository() StandardResponse {
func processRepository(noLarge bool) StandardResponse {
ctx := context.Background()

// Get target path from command line argument
// Get target path from remaining non-flag arguments
args := flag.Args()
var targetPath string
if len(os.Args) > 1 {
targetPath = os.Args[1]
if len(args) > 0 {
targetPath = args[0]
} else {
errorCode := ErrorCodeInvalidArguments
errorMessage := fmt.Sprintf("Usage: %s <target-path>", os.Args[0])
errorMessage := fmt.Sprintf("Usage: %s [--no-large] <target-path>", os.Args[0])
return StandardResponse{
Status: StatusFailure,
ErrorCode: &errorCode,
Expand All @@ -51,10 +56,10 @@ func processRepository() StandardResponse {
// Process single repository (the target path argument)
repoDir := config.TargetPath

insightsDb, err := NewInsightsDB(ctx, config.InsightsDatabase)
if err != nil {
insightsDb, dbErr := NewInsightsDB(ctx, config.InsightsDatabase)
if dbErr != nil {
errorCode := ErrorCodeDatabaseConnection
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", err)
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", dbErr)
return StandardResponse{
Status: StatusFailure,
ErrorCode: &errorCode,
Expand All @@ -76,7 +81,7 @@ func processRepository() StandardResponse {
}

// Process the repository with SCC
report, err := getSCCReport(config.SCCPath, repoDir)
report, err := getSCCReport(config.SCCPath, repoDir, noLarge)
if err != nil {
errorCode := getErrorCodeFromSCCError(err)
errorMessage := fmt.Sprintf("Error processing repository '%s': %v", repoDir, err)
Expand Down Expand Up @@ -120,10 +125,10 @@ func processRepository() StandardResponse {


// getSCCReport analyzes a directory with scc and returns a report containing the estimated cost and language statistics.
func getSCCReport(sccPath, dirPath string) (SCCReport, error) {
cost, err := getCost(sccPath, dirPath)
func getSCCReport(sccPath, dirPath string, noLarge bool) (SCCReport, error) {
cost, err := getCost(sccPath, dirPath, noLarge)
if err != nil {
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v\"", err)
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v", dirPath, err)
}

// Skip saving to database if cost is 0 - do we want to do this?
Expand All @@ -133,7 +138,7 @@ func getSCCReport(sccPath, dirPath string) (SCCReport, error) {

projectPath := filepath.Base(dirPath)

langStats, err := getLanguageStats(sccPath, dirPath)
langStats, err := getLanguageStats(sccPath, dirPath, noLarge)
if err != nil {
return SCCReport{}, fmt.Errorf("error getting language stats for '%s': %v", dirPath, err)
}
Expand Down Expand Up @@ -177,8 +182,8 @@ func getGitRepositoryURL(dirPath string) (string, error) {
}

// getCost runs the scc command and parses the output to get the estimated cost.
func getCost(sccPathPath, repoPath string) (float64, error) {
output, err := runSCC(sccPathPath, "--format=short", repoPath)
func getCost(sccPathPath, repoPath string, noLarge bool) (float64, error) {
output, err := runSCC(sccPathPath, noLarge, "--format=short", repoPath)
if err != nil {
return 0, fmt.Errorf("failed to run scc command: %w", err)
}
Expand All @@ -192,8 +197,8 @@ func getCost(sccPathPath, repoPath string) (float64, error) {
}

// getLanguageStats runs the scc command and parses the output to get language statistics.
func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
output, err := runSCC(sccPathPath, "--format=json", repoPath)
func getLanguageStats(sccPathPath, repoPath string, noLarge bool) ([]LanguageStats, error) {
output, err := runSCC(sccPathPath, noLarge, "--format=json", repoPath)
if err != nil {
return nil, fmt.Errorf("failed to run scc command: %w", err)
}
Expand All @@ -207,8 +212,14 @@ func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
}

// runSCC executes the scc command with the given arguments and returns the output.
func runSCC(sccPathPath string, args ...string) (string, error) {
cmd := exec.Command(sccPathPath, args...)
// When noLarge is true, files larger than 100MB are skipped to avoid OOM on large repos.
func runSCC(sccPathPath string, noLarge bool, args ...string) (string, error) {
var cmdArgs []string
if noLarge {
cmdArgs = append(cmdArgs, "--no-large", "--large-byte-count", "100000000")
}
Comment on lines +218 to +220
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

runSCC hard-codes the large-file threshold as the string literal "100000000". To reduce magic numbers and keep the threshold consistent across Go/Python (and with the flag help text), consider extracting this into a named constant (and potentially expressing it as 100*1000*1000 or 100*1024*1024 explicitly).

Copilot uses AI. Check for mistakes.
cmdArgs = append(cmdArgs, args...)
cmd := exec.Command(sccPathPath, cmdArgs...)
output, err := cmd.Output()
if err != nil {
if exitErr, ok := err.(*exec.ExitError); ok {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import subprocess
import time
from decimal import Decimal

Expand All @@ -8,6 +9,21 @@
from crowdgit.services.base.base_service import BaseService
from crowdgit.services.utils import run_shell_command

_LARGE_REPO_THRESHOLD_BYTES = 10 * 1024 * 1024 * 1024 # 10 GB


def _get_repo_size_bytes(repo_path: str) -> int:
"""Return total disk usage of repo_path in bytes using du -sb."""
try:
result = subprocess.run(
["du", "-sb", repo_path], capture_output=True, text=True, timeout=120
)
Comment on lines +18 to +20
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_get_repo_size_bytes() uses subprocess.run(...) inside an async service; this is a blocking call and can stall the event loop (and other concurrent repo processing). Consider using the existing async run_shell_command() helper (or asyncio.to_thread) to run du without blocking.

Copilot uses AI. Check for mistakes.
if result.returncode == 0:
return int(result.stdout.split()[0])
except Exception:
pass
return 0
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Blocking synchronous subprocess call in async method

Medium Severity

_get_repo_size_bytes uses synchronous subprocess.run to execute du -sb, which blocks the asyncio event loop when called from the async def run method. Running du -sb on a large repository can take significant time (up to the 120-second timeout). The async run_shell_command utility is already imported in this file and used elsewhere in the codebase for the same purpose (e.g., _get_repo_size_mb in clone_service.py).

Additional Locations (1)
Fix in Cursor Fix in Web

Comment on lines +23 to +25
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_get_repo_size_bytes() silently swallows all exceptions and returns 0. If du fails (e.g., not present, permission issues, timeout), the service will skip enabling --no-large without any signal. Please log a warning (and consider catching narrower exceptions / including stderr) so operators can tell when the size check is not working.

Copilot uses AI. Check for mistakes.


class SoftwareValueService(BaseService):
"""Service for calculating software value metrics"""
Expand All @@ -20,16 +36,30 @@ def __init__(self):
async def run(self, repo_id: str, repo_path: str) -> None:
"""
Triggers software value binary for given repo.
Results are saved into insights database directly
Results are saved into insights database directly.
For repos larger than 10 GB, scc is run with minimum parallelism (1 worker)
to avoid OOM; results are identical.
Comment on lines +40 to +41
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring says that for repos >10GB “scc is run with minimum parallelism (1 worker)”, but the actual behavior added below is enabling the --no-large flag (skip files >100MB). Please update the docstring to reflect the real behavior (or implement the parallelism change if that’s what was intended).

Suggested change
For repos larger than 10 GB, scc is run with minimum parallelism (1 worker)
to avoid OOM; results are identical.
For repos larger than 10 GB, scc is run with the --no-large flag (skipping files >100MB)
to reduce memory usage and avoid OOM errors.

Copilot uses AI. Check for mistakes.
"""
start_time = time.time()
execution_status = ExecutionStatus.SUCCESS
error_code = None
error_message = None

try:
cmd = [self.software_value_executable]

repo_size = _get_repo_size_bytes(repo_path)
if repo_size >= _LARGE_REPO_THRESHOLD_BYTES:
self.logger.info(
f"Repo size {repo_size / (1024**3):.1f} GB exceeds threshold — "
"running scc with no-large (skipping files >100MB)"
)
cmd += ["--no-large"]
Comment on lines +51 to +57
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New behavior adds conditional --no-large based on repo size, but there are no tests asserting the command built for large vs small repos. Since this is reliability-critical logic, please add unit coverage (e.g., mock _get_repo_size_bytes / run_shell_command and assert --no-large is included only when expected).

Copilot uses AI. Check for mistakes.

cmd.append(repo_path)

self.logger.info("Running software value...")
output = await run_shell_command([self.software_value_executable, repo_path])
output = await run_shell_command(cmd)
self.logger.info(f"Software value output: {output}")

# Parse JSON output and extract fields from StandardResponse structure
Expand Down
Loading