diff --git a/cmd/flamegraph/flamegraph.go b/cmd/flamegraph/flamegraph.go index 611e34e7..3a6c7b6f 100644 --- a/cmd/flamegraph/flamegraph.go +++ b/cmd/flamegraph/flamegraph.go @@ -59,7 +59,7 @@ const ( func init() { Cmd.Flags().StringVar(&common.FlagInput, common.FlagInputName, "", "") Cmd.Flags().StringSliceVar(&common.FlagFormat, common.FlagFormatName, []string{report.FormatAll}, "") - Cmd.Flags().IntVar(&flagDuration, flagDurationName, 30, "") + Cmd.Flags().IntVar(&flagDuration, flagDurationName, 0, "") Cmd.Flags().IntVar(&flagFrequency, flagFrequencyName, 11, "") Cmd.Flags().IntSliceVar(&flagPids, flagPidsName, nil, "") Cmd.Flags().BoolVar(&flagNoSystemSummary, flagNoSystemSummaryName, false, "") @@ -100,7 +100,7 @@ func getFlagGroups() []common.FlagGroup { flags := []common.Flag{ { Name: flagDurationName, - Help: "number of seconds to run the collection", + Help: "number of seconds to run the collection. If 0, the collection will run indefinitely. Ctrl+c to stop.", }, { Name: flagFrequencyName, @@ -155,19 +155,19 @@ func validateFlags(cmd *cobra.Command, args []string) error { return common.FlagValidationError(cmd, fmt.Sprintf("input file %s does not exist", common.FlagInput)) } } - if flagDuration <= 0 { - return common.FlagValidationError(cmd, "duration must be greater than 0") + if flagDuration < 0 { + return common.FlagValidationError(cmd, "duration must be 0 or greater") } if flagFrequency <= 0 { - return common.FlagValidationError(cmd, "frequency must be greater than 0") + return common.FlagValidationError(cmd, "frequency must be 1 or greater") } for _, pid := range flagPids { if pid < 0 { - return common.FlagValidationError(cmd, "PID must be greater than or equal to 0") + return common.FlagValidationError(cmd, "PID must be 0 or greater") } } if flagMaxDepth < 0 { - return common.FlagValidationError(cmd, "max depth must be greater than or equal to 0") + return common.FlagValidationError(cmd, "max depth must be 0 or greater") } // common target flags if err := common.ValidateTargetFlags(cmd); err != nil { diff --git a/cmd/metrics/metadata.go b/cmd/metrics/metadata.go index fc6328e7..2249abb3 100644 --- a/cmd/metrics/metadata.go +++ b/cmd/metrics/metadata.go @@ -692,7 +692,7 @@ func getUncoreDeviceIDs(isAMDArchitecture bool, scriptOutputs map[string]script. // getCPUInfo - reads and returns all data from /proc/cpuinfo func getCPUInfo(t target.Target) (cpuInfo []map[string]string, err error) { cmd := exec.Command("cat", "/proc/cpuinfo") - stdout, stderr, exitcode, err := t.RunCommand(cmd, 0, true) + stdout, stderr, exitcode, err := t.RunCommand(cmd) if err != nil { err = fmt.Errorf("failed to get cpuinfo: %s, %d, %v", stderr, exitcode, err) return @@ -717,7 +717,7 @@ func getCPUInfo(t target.Target) (cpuInfo []map[string]string, err error) { // getLscpu - runs lscpu on the target and returns the output func getLscpu(t target.Target) (output string, err error) { cmd := exec.Command("lscpu") - output, stderr, exitcode, err := t.RunCommand(cmd, 0, true) + output, stderr, exitcode, err := t.RunCommand(cmd) if err != nil || exitcode != 0 { err = fmt.Errorf("failed to run lscpu: %s, %d, %v", stderr, exitcode, err) return diff --git a/cmd/metrics/nmi_watchdog.go b/cmd/metrics/nmi_watchdog.go index 8f0644da..e1ef1c44 100644 --- a/cmd/metrics/nmi_watchdog.go +++ b/cmd/metrics/nmi_watchdog.go @@ -49,7 +49,7 @@ func getNMIWatchdog(myTarget target.Target) (setting string, err error) { return } cmd := exec.Command(sysctl, "kernel.nmi_watchdog") // #nosec G204 // nosemgrep - stdout, _, _, err := myTarget.RunCommand(cmd, 0, true) + stdout, _, _, err := myTarget.RunCommand(cmd) if err != nil { return } @@ -88,7 +88,7 @@ func setNMIWatchdog(myTarget target.Target, setting string, localTempDir string) // findSysctl - gets a useable path to sysctl or error func findSysctl(myTarget target.Target) (path string, err error) { cmd := exec.Command("which", "sysctl") - stdout, _, _, err := myTarget.RunCommand(cmd, 0, true) + stdout, _, _, err := myTarget.RunCommand(cmd) if err == nil { //found it path = strings.TrimSpace(stdout) @@ -97,7 +97,7 @@ func findSysctl(myTarget target.Target) (path string, err error) { // didn't find it on the path, try being specific sbinPath := "/usr/sbin/sysctl" cmd = exec.Command("which", sbinPath) - _, _, _, err = myTarget.RunCommand(cmd, 0, true) + _, _, _, err = myTarget.RunCommand(cmd) if err == nil { // found it path = sbinPath diff --git a/cmd/metrics/process.go b/cmd/metrics/process.go index f71f1b94..4c92f1ae 100644 --- a/cmd/metrics/process.go +++ b/cmd/metrics/process.go @@ -67,7 +67,7 @@ func GetHotProcesses(myTarget target.Target, maxProcesses int, filter string) (p } // run ps to get list of processes sorted by cpu utilization (descending) cmd := exec.Command("ps", "-a", "-x", "-h", "-o", "pid,ppid,comm,cmd", "--sort=-%cpu") - stdout, stderr, exitcode, err := myTarget.RunCommand(cmd, 0, true) + stdout, stderr, exitcode, err := myTarget.RunCommand(cmd) if err != nil { err = fmt.Errorf("failed to get hot processes: %s, %d, %v", stderr, exitcode, err) return @@ -177,7 +177,7 @@ done | sort -nr | head -n %d func processExists(myTarget target.Target, pid string) (exists bool) { cmd := exec.Command("ps", "-p", pid) - _, _, _, err := myTarget.RunCommand(cmd, 0, true) + _, _, _, err := myTarget.RunCommand(cmd) if err != nil { exists = false return @@ -188,7 +188,7 @@ func processExists(myTarget target.Target, pid string) (exists bool) { func getProcess(myTarget target.Target, pid string) (process Process, err error) { cmd := exec.Command("ps", "-q", pid, "h", "-o", "pid,ppid,comm,cmd", "ww") - stdout, stderr, exitcode, err := myTarget.RunCommand(cmd, 0, true) + stdout, stderr, exitcode, err := myTarget.RunCommand(cmd) if err != nil { err = fmt.Errorf("failed to get process: %s, %d, %v", stderr, exitcode, err) return diff --git a/cmd/telemetry/telemetry.go b/cmd/telemetry/telemetry.go index ce89a579..aaf40106 100644 --- a/cmd/telemetry/telemetry.go +++ b/cmd/telemetry/telemetry.go @@ -122,7 +122,7 @@ func init() { Cmd.Flags().StringVar(&common.FlagInput, common.FlagInputName, "", "") Cmd.Flags().BoolVar(&flagAll, flagAllName, true, "") Cmd.Flags().StringSliceVar(&common.FlagFormat, common.FlagFormatName, []string{report.FormatAll}, "") - Cmd.Flags().IntVar(&flagDuration, flagDurationName, 30, "") + Cmd.Flags().IntVar(&flagDuration, flagDurationName, 0, "") Cmd.Flags().IntVar(&flagInterval, flagIntervalName, 2, "") Cmd.Flags().IntVar(&flagInstrMixPid, flagInstrMixPidName, 0, "") Cmd.Flags().IntVar(&flagInstrMixFrequency, flagInstrMixFrequencyName, instrmixFrequencyDefaultSystemWide, "") @@ -244,9 +244,6 @@ func validateFlags(cmd *cobra.Command, args []string) error { if flagDuration < 0 { return common.FlagValidationError(cmd, "duration must be 0 or greater") } - if flagDuration == 0 && (cmd.Flags().Lookup(common.FlagTargetsFileName).Changed || cmd.Flags().Lookup(common.FlagTargetHostName).Changed) { - return common.FlagValidationError(cmd, "duration must be greater than 0 when collecting from a remote target") - } if flagInstrMixFrequency < 100000 { // 100,000 instructions is the minimum frequency return common.FlagValidationError(cmd, "instruction mix frequency must be 100,000 or greater to limit overhead") } diff --git a/internal/common/common.go b/internal/common/common.go index f7442b14..4bd1c9e3 100644 --- a/internal/common/common.go +++ b/internal/common/common.go @@ -6,10 +6,12 @@ package common // SPDX-License-Identifier: BSD-3-Clause import ( + "context" "errors" "fmt" "log/slog" "os" + "os/exec" "os/signal" "path/filepath" "perfspect/internal/progress" @@ -20,6 +22,7 @@ import ( "perfspect/internal/util" "strings" "syscall" + "time" "slices" @@ -118,22 +121,6 @@ func (rc *ReportingCommand) Run() error { localTempDir := appContext.LocalTempDir outputDir := appContext.OutputDir logFilePath := appContext.LogFilePath - // handle signals - // child processes will exit when the signals are received which will - // allow this app to exit normally - sigChannel := make(chan os.Signal, 1) - signal.Notify(sigChannel, syscall.SIGINT, syscall.SIGTERM) - go func() { - sig := <-sigChannel - slog.Info("received signal", slog.String("signal", sig.String())) - // when perfspect receives ctrl-c while in the shell, the shell makes sure to propogate the - // signal to all our children. But when perfspect is run in the background or disowned and - // then receives SIGINT, e.g., from a script, we need to send the signal to our children - err := util.SignalChildren(syscall.SIGINT) - if err != nil { - slog.Error("error sending signal to children", slog.String("error", err.Error())) - } - }() // create output directory err := util.CreateDirectoryIfNotExists(outputDir, 0755) // #nosec G301 if err != nil { @@ -144,8 +131,8 @@ func (rc *ReportingCommand) Run() error { return err } - var orderedTargetScriptOutputs []TargetScriptOutputs var myTargets []target.Target + var orderedTargetScriptOutputs []TargetScriptOutputs if FlagInput != "" { var err error orderedTargetScriptOutputs, err = outputsFromInput(rc.Tables, rc.SummaryTableName) @@ -203,6 +190,8 @@ func (rc *ReportingCommand) Run() error { for i := len(indicesToRemove) - 1; i >= 0; i-- { myTargets = slices.Delete(myTargets, indicesToRemove[i], indicesToRemove[i]+1) } + // set up signal handler to help with cleaning up child processes on ctrl-c/SIGINT or SIGTERM + configureSignalHandler(myTargets, multiSpinner.Status) // collect data from targets orderedTargetScriptOutputs, err = outputsFromTargets(rc.Cmd, myTargets, rc.Tables, rc.ScriptParams, multiSpinner.Status, localTempDir) if err != nil { @@ -299,6 +288,94 @@ func (rc *ReportingCommand) Run() error { return nil } +// configureSignalHandler sets up a signal handler to catch SIGINT and SIGTERM +// +// When perfspect receives ctrl-c while in the shell, the shell propagates the +// signal to all our children. But when perfspect is run in the background or disowned and +// then receives SIGINT, e.g., from a script, we need to send the signal to our children +// +// Also, when running scripts in parallel using the parallel_master.sh script, we need to +// send the signal to the parallel_master.sh script on each target so that it can clean up +// its child processes. This is because the parallel_master.sh script is run in its own process group +// and does not receive the signal when perfspect receives it. +// +// Parameters: +// - myTargets: The list of targets to send the signal to. +// - statusFunc: A function to update the status of the progress indicator. +func configureSignalHandler(myTargets []target.Target, statusFunc progress.MultiSpinnerUpdateFunc) { + sigChannel := make(chan os.Signal, 1) + signal.Notify(sigChannel, syscall.SIGINT, syscall.SIGTERM) + go func() { + sig := <-sigChannel + slog.Debug("received signal", slog.String("signal", sig.String())) + // Scripts that are run in parallel using the parallel_master.sh script and a few other sequential scripts need to be handled specially + // because they are run in their own process group, we need to send the signal directly to the PID of the script. + // For every target, look for the primary_collection_script PID file and send SIGINT to it. + for _, t := range myTargets { + if statusFunc != nil { + _ = statusFunc(t.GetName(), "Signal received, cleaning up...") + } + pidFilePath := filepath.Join(t.GetTempDirectory(), "primary_collection_script.pid") + stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204 + if err != nil { + slog.Error("error retrieving target primary_collection_script PID", slog.String("target", t.GetName()), slog.String("error", err.Error())) + } + if exitcode == 0 { + pidStr := strings.TrimSpace(stdout) + _, _, _, err := t.RunCommandEx(exec.Command("sudo", "kill", "-SIGINT", pidStr), 5, false, true) // #nosec G204 + if err != nil { + slog.Error("error sending signal to target primary_collection_script", slog.String("target", t.GetName()), slog.String("error", err.Error())) + } + } + } + // now wait until all primary collection scripts have exited + slog.Debug("waiting for primary_collection_script scripts to exit") + for _, t := range myTargets { + // create a per-target timeout context + targetTimeout := 10 * time.Second + ctx, cancel := context.WithTimeout(context.Background(), targetTimeout) + timedOut := false + pidFilePath := filepath.Join(t.GetTempDirectory(), "primary_collection_script.pid") + for { + // check for timeout + select { + case <-ctx.Done(): + if statusFunc != nil { + _ = statusFunc(t.GetName(), "cleanup timeout exceeded") + } + slog.Warn("signal handler cleanup timeout exceeded for target", slog.String("target", t.GetName())) + timedOut = true + default: + } + if timedOut { + break + } + // read the pid file + stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204 + if err != nil || exitcode != 0 { + // pid file doesn't exist + break + } + pidStr := strings.TrimSpace(stdout) + // determine if the process still exists + _, _, exitcode, err = t.RunCommandEx(exec.Command("ps", "-p", pidStr), 5, false, true) // #nosec G204 + if err != nil || exitcode != 0 { + break // process no longer exists, script has exited + } + // sleep for a short time before checking again + time.Sleep(500 * time.Millisecond) + } + cancel() + } + + // send SIGINT to perfspect's children + err := util.SignalChildren(syscall.SIGINT) + if err != nil { + slog.Error("error sending signal to children", slog.String("error", err.Error())) + } + }() +} + // DefaultInsightsFunc returns the insights table values from the table values func DefaultInsightsFunc(allTableValues []table.TableValues, scriptOutputs map[string]script.ScriptOutput) table.TableValues { insightsTableValues := table.TableValues{ @@ -554,7 +631,8 @@ func outputsFromTargets(cmd *cobra.Command, myTargets []target.Target, tables [] scriptsToRunOnTarget = append(scriptsToRunOnTarget, script) } // run the selected scripts on the target - go collectOnTarget(target, scriptsToRunOnTarget, localTempDir, scriptParams["Duration"], cmd.Name() == "telemetry", channelTargetScriptOutputs, channelError, statusUpdate) + ctrlCToStop := cmd.Name() == "telemetry" || cmd.Name() == "flamegraph" + go collectOnTarget(target, scriptsToRunOnTarget, localTempDir, scriptParams["Duration"], ctrlCToStop, channelTargetScriptOutputs, channelError, statusUpdate) } // wait for scripts to run on all targets var allTargetScriptOutputs []TargetScriptOutputs @@ -631,10 +709,10 @@ func elevatedPrivilegesRequired(tables []table.TableDefinition) bool { } // collectOnTarget runs the scripts on the target and sends the results to the appropriate channels -func collectOnTarget(myTarget target.Target, scriptsToRun []script.ScriptDefinition, localTempDir string, duration string, isTelemetry bool, channelTargetScriptOutputs chan TargetScriptOutputs, channelError chan error, statusUpdate progress.MultiSpinnerUpdateFunc) { +func collectOnTarget(myTarget target.Target, scriptsToRun []script.ScriptDefinition, localTempDir string, duration string, ctrlCToStop bool, channelTargetScriptOutputs chan TargetScriptOutputs, channelError chan error, statusUpdate progress.MultiSpinnerUpdateFunc) { // run the scripts on the target status := "collecting data" - if isTelemetry && duration == "0" { // telemetry is the only command that uses this common code that can run indefinitely + if ctrlCToStop && duration == "0" { status += ", press Ctrl+c to stop" } else if duration != "0" && duration != "" { status += fmt.Sprintf(" for %s seconds", duration) diff --git a/internal/common/targets.go b/internal/common/targets.go index cc0d3a84..0f6357be 100644 --- a/internal/common/targets.go +++ b/internal/common/targets.go @@ -495,7 +495,7 @@ func parseMountOutput(mountOutput string) ([]mountRecord, error) { // isDirNoExec checks if the target directory is on a file system that is mounted with noexec. func isDirNoExec(t target.Target, dir string) (bool, error) { dfCmd := exec.Command("df", "-P", dir) - dfOutput, _, _, err := t.RunCommand(dfCmd, 0, true) + dfOutput, _, _, err := t.RunCommand(dfCmd) if err != nil { err = fmt.Errorf("failed to run df command: %w", err) return false, err @@ -509,7 +509,7 @@ func isDirNoExec(t target.Target, dir string) (bool, error) { return false, err } mountCmd := exec.Command("mount") - mountOutput, _, _, err := t.RunCommand(mountCmd, 0, true) + mountOutput, _, _, err := t.RunCommand(mountCmd) if err != nil { err = fmt.Errorf("failed to run mount command: %w", err) return false, err @@ -553,7 +553,7 @@ func GetTargetVendor(t target.Target) (string, error) { if vendor == "" { cmd := exec.Command("bash", "-c", "lscpu | grep -i \"^Vendor ID:\" | awk '{print $NF}'") var err error - vendor, _, _, err = t.RunCommand(cmd, 0, true) + vendor, _, _, err = t.RunCommand(cmd) if err != nil { return "", fmt.Errorf("failed to get target CPU vendor: %v", err) } @@ -568,7 +568,7 @@ func GetTargetFamily(t target.Target) (string, error) { if family == "" { cmd := exec.Command("bash", "-c", "lscpu | grep -i \"^CPU family:\" | awk '{print $NF}'") var err error - family, _, _, err = t.RunCommand(cmd, 0, true) + family, _, _, err = t.RunCommand(cmd) if err != nil { return "", fmt.Errorf("failed to get target CPU family: %v", err) } @@ -583,7 +583,7 @@ func GetTargetModel(t target.Target) (string, error) { if model == "" { cmd := exec.Command("bash", "-c", "lscpu | grep -i \"^Model:\" | awk '{print $NF}'") var err error - model, _, _, err = t.RunCommand(cmd, 0, true) + model, _, _, err = t.RunCommand(cmd) if err != nil { return "", fmt.Errorf("failed to get target CPU model: %v", err) } @@ -598,7 +598,7 @@ func GetTargetStepping(t target.Target) (string, error) { if stepping == "" { cmd := exec.Command("bash", "-c", "lscpu | grep -i \"^Stepping:\" | awk '{print $NF}'") var err error - stepping, _, _, err = t.RunCommand(cmd, 0, true) + stepping, _, _, err = t.RunCommand(cmd) if err != nil { return "", fmt.Errorf("failed to get target CPU stepping: %v", err) } diff --git a/internal/script/script.go b/internal/script/script.go index f753fa65..e3b0faec 100644 --- a/internal/script/script.go +++ b/internal/script/script.go @@ -125,7 +125,14 @@ func RunScripts(myTarget target.Target, scripts []ScriptDefinition, ignoreScript } else { cmd = exec.Command("bash", path.Join(myTarget.GetTempDirectory(), masterScriptName)) // #nosec G204 } - stdout, stderr, exitcode, err := myTarget.RunCommand(cmd, 0, false) // don't reuse ssh connection on long-running commands, makes it difficult to kill the command + timeout := 0 // no timeout + // We run parallel_master in a new process group so that tty/terminal signals, e.g., Ctrl-C, are not sent to the command. This is + // necessary to allow the master script to handle signals itself and propagate them to the child scripts as needed. The + // signal handler in perfspect will send the signal to the parallel_master.sh script on each target so that it can clean up + // its child processes. + newProcessGroup := true + reuseSSHConnection := false // don't reuse ssh connection on long-running commands, makes it difficult to kill the command + stdout, stderr, exitcode, err := myTarget.RunCommandEx(cmd, timeout, newProcessGroup, reuseSSHConnection) if err != nil { slog.Error("error running master script on target", slog.String("stdout", stdout), slog.String("stderr", stderr), slog.Int("exitcode", exitcode), slog.String("error", err.Error())) return nil, err @@ -159,7 +166,13 @@ func RunScripts(myTarget target.Target, scripts []ScriptDefinition, ignoreScript } else { cmd = exec.Command("bash", scriptPath) // #nosec G204 } - stdout, stderr, exitcode, err := myTarget.RunCommand(cmd, 0, false) + // if the script is tagged with NeedsKill, we run it in a new process group so that tty/terminal signals, e.g., Ctrl-C, are not sent to the command. This is + // necessary to allow the script to handle signals itself and clean up as needed. The + // signal handler in perfspect will send the signal to the script on each target so that it can clean up + // as needed. + newProcessGroup := script.NeedsKill + reuseSSHConnection := false // don't reuse ssh connection on long-running commands, makes it difficult to kill the command + stdout, stderr, exitcode, err := myTarget.RunCommandEx(cmd, 0, newProcessGroup, reuseSSHConnection) if err != nil { slog.Warn("error running script on target", slog.String("name", script.Name), slog.String("stdout", stdout), slog.String("stderr", stderr), slog.Int("exitcode", exitcode), slog.String("error", err.Error())) } @@ -189,7 +202,7 @@ func RunScriptStream(myTarget target.Target, script ScriptDefinition, localTempD }() } cmd := prepareCommand(script, myTarget.GetTempDirectory()) - err = myTarget.RunCommandStream(cmd, 0, false, stdoutChannel, stderrChannel, exitcodeChannel, cmdChannel) + err = myTarget.RunCommandStream(cmd, stdoutChannel, stderrChannel, exitcodeChannel, cmdChannel) errorChannel <- err } @@ -247,6 +260,9 @@ set -o pipefail script_dir={{.TargetTempDir}} cd "$script_dir" +# write our pid to a file so that perfspect can send us a signal if needed +echo $$ > primary_collection_script.pid + declare -a scripts=() declare -A needs_kill=() declare -A pids=() @@ -318,6 +334,7 @@ handle_sigint() { kill_script "$s" done print_summary + rm -f primary_collection_script.pid exit 0 } @@ -326,6 +343,7 @@ trap handle_sigint SIGINT start_scripts wait_for_scripts print_summary +rm -f primary_collection_script.pid ` tmpl, err := template.New("master").Parse(masterScriptTemplate) if err != nil { diff --git a/internal/script/script_defs.go b/internal/script/script_defs.go index 8687f0ee..866c95b1 100644 --- a/internal/script/script_defs.go +++ b/internal/script/script_defs.go @@ -1452,36 +1452,90 @@ duration={{.Duration}} frequency={{.Frequency}} maxdepth={{.MaxDepth}} +# write our pid to a file so that perfspect can send us a signal if needed +echo $$ > primary_collection_script.pid + ap_interval=0 if [ "$frequency" -ne 0 ]; then ap_interval=$((1000000000 / frequency)) fi -# Function to restore original settings and clean up -restore_settings() { - echo "$PERF_EVENT_PARANOID" > /proc/sys/kernel/perf_event_paranoid - echo "$KPTR_RESTRICT" > /proc/sys/kernel/kptr_restrict +# Function to stop profiling +stop_profiling() { if [ -n "$perf_fp_pid" ]; then - kill -0 $perf_fp_pid 2>/dev/null && kill -INT $perf_fp_pid + kill -0 "$perf_fp_pid" 2>/dev/null && kill -INT "$perf_fp_pid" + wait "$perf_fp_pid" fi if [ -n "$perf_dwarf_pid" ]; then - kill -0 $perf_dwarf_pid 2>/dev/null && kill -INT $perf_dwarf_pid + kill -0 "$perf_dwarf_pid" 2>/dev/null && kill -INT "$perf_dwarf_pid" + wait "$perf_dwarf_pid" fi for pid in "${java_pids[@]}"; do - async-profiler/bin/asprof stop -o collapsed "$pid" + async-profiler/bin/asprof stop -o collapsed -f ap_folded_"$pid" "$pid" done + # Restore original settings + echo "$perf_event_paranoid" > /proc/sys/kernel/perf_event_paranoid + echo "$kptr_restrict" > /proc/sys/kernel/kptr_restrict +} + +# Function to collapse perf data +collapse_perf_data() { + if [ -f perf_dwarf_data ]; then + perf script -i perf_dwarf_data > perf_dwarf_stacks + stackcollapse-perf perf_dwarf_stacks > perf_dwarf_folded + else + echo "Error: perf_dwarf_data file not found" >&2 + fi + if [ -f perf_fp_data ]; then + perf script -i perf_fp_data > perf_fp_stacks + stackcollapse-perf perf_fp_stacks > perf_fp_folded + else + echo "Error: perf_fp_data file not found" >&2 + fi +} + +# Function to print results to stdout +print_results() { + echo "########## maximum depth ##########" + echo "$maxdepth" + + if [ -f perf_dwarf_folded ]; then + echo "########## perf_dwarf ##########" + cat perf_dwarf_folded + fi + if [ -f perf_fp_folded ]; then + echo "########## perf_fp ##########" + cat perf_fp_folded + fi + + for idx in "${!java_pids[@]}"; do + pid="${java_pids[$idx]}" + cmd="${java_cmds[$idx]}" + echo "########## async-profiler $pid $cmd ##########" + if [ -f ap_folded_"$pid" ]; then + cat ap_folded_"$pid" + else + echo "Error: async-profiler output file not found for PID $pid" >&2 + fi + done +} + +# Function to finalize profiling and output +finalize() { + stop_profiling + collapse_perf_data + print_results + rm -f primary_collection_script.pid + exit 0 } # Adjust perf_event_paranoid and kptr_restrict -PERF_EVENT_PARANOID=$(cat /proc/sys/kernel/perf_event_paranoid) +perf_event_paranoid=$(cat /proc/sys/kernel/perf_event_paranoid) echo -1 >/proc/sys/kernel/perf_event_paranoid -KPTR_RESTRICT=$(cat /proc/sys/kernel/kptr_restrict) +kptr_restrict=$(cat /proc/sys/kernel/kptr_restrict) echo 0 >/proc/sys/kernel/kptr_restrict -# Ensure settings are restored on exit -trap restore_settings EXIT - -# Check if at least one process is running +# If pids specified, check if at least one of them is running if [ -n "$pids" ]; then IFS=',' read -r -a pid_array <<< "$pids" for p in "${pid_array[@]}"; do @@ -1491,6 +1545,7 @@ if [ -n "$pids" ]; then fi else echo "Error: Process $p is not running." >&2 + stop_profiling exit 1 fi done @@ -1498,7 +1553,7 @@ else mapfile -t java_pids < <(pgrep java) fi -# Frame pointer mode +# Start profiling with perf in frame pointer mode if [ -n "$pids" ]; then perf record -F "$frequency" -p "$pids" -g -o perf_fp_data -m 129 & else @@ -1507,10 +1562,11 @@ fi perf_fp_pid=$! if ! kill -0 $perf_fp_pid 2>/dev/null; then echo "Failed to start perf record in frame pointer mode" >&2 + stop_profiling exit 1 fi -# Dwarf mode +# Start profiling with perf in dwarf mode if [ -n "$pids" ]; then perf record -F "$frequency" -p "$pids" -g -o perf_dwarf_data -m 257 --call-graph dwarf,8192 & else @@ -1519,79 +1575,32 @@ fi perf_dwarf_pid=$! if ! kill -0 $perf_dwarf_pid 2>/dev/null; then echo "Failed to start perf record in dwarf mode" >&2 + stop_profiling exit 1 fi -# Start Java profiling for each Java PID +# Start profiling Java with async-profiler for each Java PID for pid in "${java_pids[@]}"; do java_cmds+=("$(tr '\000' ' ' < /proc/"$pid"/cmdline)") async-profiler/bin/asprof start -i "$ap_interval" -F probesp+vtable "$pid" done -# Wait for the specified duration -sleep "$duration" +# profiling has been started, set up trap to finalize on interrupt +trap finalize INT TERM -# Stop perf recording -if ! kill -0 $perf_fp_pid 2>/dev/null; then - echo "Frame pointer mode already stopped" >&2 -else - kill -INT $perf_fp_pid -fi -if ! kill -0 $perf_dwarf_pid 2>/dev/null; then - echo "Dwarf mode already stopped" >&2 -else - kill -INT $perf_dwarf_pid -fi - -# Stop Java profiling, write output to ap_folded_ files -for pid in "${java_pids[@]}"; do - async-profiler/bin/asprof stop -o collapsed -f ap_folded_"$pid" "$pid" -done - -# Wait for perf to finish -wait ${perf_fp_pid} ${perf_dwarf_pid} - -# Collapse perf data -if [ -f perf_dwarf_data ]; then - perf script -i perf_dwarf_data > perf_dwarf_stacks - stackcollapse-perf perf_dwarf_stacks > perf_dwarf_folded -else - echo "Error: perf_dwarf_data file not found" >&2 -fi -if [ -f perf_fp_data ]; then - perf script -i perf_fp_data > perf_fp_stacks - stackcollapse-perf perf_fp_stacks > perf_fp_folded -else - echo "Error: perf_fp_data file not found" >&2 -fi - -# Dump results to stdout -echo "########## maximum depth ##########" -echo "$maxdepth" - -if [ -f perf_dwarf_folded ]; then - echo "########## perf_dwarf ##########" - cat perf_dwarf_folded -fi -if [ -f perf_fp_folded ]; then - echo "########## perf_fp ##########" - cat perf_fp_folded +# Wait for the specified duration (seconds), then wrap it up by calling finalize +if [ "$duration" -gt 0 ]; then + sleep "$duration" + finalize fi -for idx in "${!java_pids[@]}"; do - pid="${java_pids[$idx]}" - cmd="${java_cmds[$idx]}" - echo "########## async-profiler $pid $cmd ##########" - if [ -f ap_folded_"$pid" ]; then - cat ap_folded_"$pid" - else - echo "Error: async-profiler output file not found for PID $pid" >&2 - fi -done +# Wait indefinitely until interrupted, trap will call finalize +wait `, Superuser: true, Sequential: true, Depends: []string{"async-profiler", "perf", "stackcollapse-perf"}, + NeedsKill: true, }, // lock analysis scripts ProfileKernelLockScriptName: { diff --git a/internal/target/helpers.go b/internal/target/helpers.go index 241eaf16..9deebe42 100644 --- a/internal/target/helpers.go +++ b/internal/target/helpers.go @@ -11,6 +11,7 @@ import ( "log/slog" "os/exec" "strings" + "syscall" "time" ) @@ -32,7 +33,7 @@ func installLkms(t Target, lkms []string) (installedLkms []string, err error) { } for _, lkm := range lkms { slog.Debug("attempting to install kernel module", slog.String("lkm", lkm)) - _, _, _, err := t.RunCommand(exec.Command("modprobe", "--first-time", lkm), 10, true) // #nosec G204 + _, _, _, err := t.RunCommandEx(exec.Command("modprobe", "--first-time", lkm), 10, false, true) // #nosec G204 if err != nil { slog.Debug("kernel module already installed or problem installing", slog.String("lkm", lkm), slog.String("error", err.Error())) continue @@ -60,7 +61,7 @@ func uninstallLkms(t Target, lkms []string) (err error) { } for _, lkm := range lkms { slog.Debug("attempting to uninstall kernel module", slog.String("lkm", lkm)) - _, _, _, err := t.RunCommand(exec.Command("modprobe", "-r", lkm), 10, true) // #nosec G204 + _, _, _, err := t.RunCommandEx(exec.Command("modprobe", "-r", lkm), 10, false, true) // #nosec G204 if err != nil { slog.Error("error uninstalling kernel module", slog.String("lkm", lkm), slog.String("error", err.Error())) continue @@ -77,13 +78,14 @@ func uninstallLkms(t Target, lkms []string) (err error) { // - cmd: The command to execute, represented as an *exec.Cmd. // - input: A string to be passed as input to the command's standard input. // - timeout: The timeout in seconds for the command execution. If set to 0, no timeout is applied. +// - newProcessGroup: A boolean indicating whether to run the command in a new process group. // // Returns: // - stdout: The standard output of the command as a string. // - stderr: The standard error of the command as a string. // - exitCode: The exit code of the command. If the command fails to execute, this may be undefined. // - err: An error object if the command fails to execute or times out. -func runLocalCommandWithInputWithTimeout(cmd *exec.Cmd, input string, timeout int) (stdout string, stderr string, exitCode int, err error) { +func runLocalCommandWithInputWithTimeout(cmd *exec.Cmd, input string, timeout int, newProcessGroup bool) (stdout string, stderr string, exitCode int, err error) { logInput := "" if input != "" { logInput = "******" @@ -103,6 +105,10 @@ func runLocalCommandWithInputWithTimeout(cmd *exec.Cmd, input string, timeout in var outbuf, errbuf strings.Builder cmd.Stdout = &outbuf cmd.Stderr = &errbuf + if newProcessGroup { + // isolate the command in its own process group, so that signals sent to perfspect don't affect it + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + } err = cmd.Run() stdout = outbuf.String() stderr = errbuf.String() @@ -203,7 +209,7 @@ func runLocalCommandWithInputWithTimeoutAsync(cmd *exec.Cmd, stdoutChannel chan // - err: An error if the command execution fails or if there is an issue retrieving the architecture. func getArchitecture(t Target) (arch string, err error) { cmd := exec.Command("uname", "-m") - arch, _, _, err = t.RunCommand(cmd, 0, true) + arch, _, _, err = t.RunCommand(cmd) if err != nil { return } diff --git a/internal/target/local_target.go b/internal/target/local_target.go index c66e94b8..36822006 100644 --- a/internal/target/local_target.go +++ b/internal/target/local_target.go @@ -20,14 +20,22 @@ func (t *LocalTarget) SetSudo(sudo string) { t.canElevate = 0 } -// RunCommand executes the given command with a timeout and returns the standard output, -// standard error, exit code, and any error that occurred. -func (t *LocalTarget) RunCommand(cmd *exec.Cmd, timeout int, argNotUsed bool) (stdout string, stderr string, exitCode int, err error) { +// RunCommand executes the given command on the local target +func (t *LocalTarget) RunCommand(cmd *exec.Cmd) (stdout string, stderr string, exitCode int, err error) { input := "" if t.sudo != "" && len(cmd.Args) > 2 && cmd.Args[0] == "sudo" && strings.HasPrefix(cmd.Args[1], "-") && strings.Contains(cmd.Args[1], "S") { // 'sudo -S' gets password from stdin input = t.sudo + "\n" } - return runLocalCommandWithInputWithTimeout(cmd, input, timeout) + return runLocalCommandWithInputWithTimeout(cmd, input, 0, false) +} + +// RunCommandEx executes a command on the local target with additional options. +func (t *LocalTarget) RunCommandEx(cmd *exec.Cmd, timeout int, newProcessGroup bool, remoteReuseSSHConnection bool) (stdout string, stderr string, exitCode int, err error) { + input := "" + if t.sudo != "" && len(cmd.Args) > 2 && cmd.Args[0] == "sudo" && strings.HasPrefix(cmd.Args[1], "-") && strings.Contains(cmd.Args[1], "S") { // 'sudo -S' gets password from stdin + input = t.sudo + "\n" + } + return runLocalCommandWithInputWithTimeout(cmd, input, timeout, newProcessGroup) } // RunCommandStream runs the given command asynchronously on the target. @@ -36,10 +44,10 @@ func (t *LocalTarget) RunCommand(cmd *exec.Cmd, timeout int, argNotUsed bool) (s // and the exit code is sent to the exitcodeChannel. // The timeout parameter specifies the maximum time allowed for the command to run. // Returns an error if there was a problem running the command. -func (t *LocalTarget) RunCommandStream(cmd *exec.Cmd, timeout int, argNotUsed bool, stdoutChannel chan []byte, stderrChannel chan []byte, exitcodeChannel chan int, cmdChannel chan *exec.Cmd) (err error) { +func (t *LocalTarget) RunCommandStream(cmd *exec.Cmd, stdoutChannel chan []byte, stderrChannel chan []byte, exitcodeChannel chan int, cmdChannel chan *exec.Cmd) (err error) { localCommand := cmd cmdChannel <- localCommand - err = runLocalCommandWithInputWithTimeoutAsync(localCommand, stdoutChannel, stderrChannel, exitcodeChannel, "", timeout) + err = runLocalCommandWithInputWithTimeoutAsync(localCommand, stdoutChannel, stderrChannel, exitcodeChannel, "", 0) return } @@ -174,14 +182,14 @@ func (t *LocalTarget) CanElevatePrivileges() bool { slog.Error("error writing sudo password", slog.String("error", err.Error())) } }() - _, _, _, err := t.RunCommand(cmd, 0, true) + _, _, _, err := t.RunCommand(cmd) if err == nil { t.canElevate = 1 return true // sudo password works } } cmd := exec.Command("sudo", "-kS", "ls") - _, _, _, err := t.RunCommand(cmd, 0, true) + _, _, _, err := t.RunCommand(cmd) if err == nil { // true - passwordless sudo works t.canElevate = 1 return true diff --git a/internal/target/remote_target.go b/internal/target/remote_target.go index 239af8c7..ffff3388 100644 --- a/internal/target/remote_target.go +++ b/internal/target/remote_target.go @@ -22,23 +22,16 @@ func (t *RemoteTarget) SetSshPass(sshPass string) { t.sshPass = sshPass } -// RunCommand executes a command on the remote target using SSH. It prepares the -// local command to be executed, optionally reusing an existing SSH connection, -// and runs it with a specified timeout. -// -// Parameters: -// - cmd: The command to be executed, represented as an *exec.Cmd. -// - timeout: The maximum duration (in seconds) to wait for the command to complete. -// - reuseSSHConnection: A boolean indicating whether to reuse an existing SSH connection. -// -// Returns: -// - stdout: The standard output of the executed command. -// - stderr: The standard error output of the executed command. -// - exitCode: The exit code returned by the command. -// - err: An error object if the command execution fails. -func (t *RemoteTarget) RunCommand(cmd *exec.Cmd, timeout int, reuseSSHConnection bool) (stdout string, stderr string, exitCode int, err error) { +// RunCommand executes a command on the remote target using SSH. +func (t *RemoteTarget) RunCommand(cmd *exec.Cmd) (stdout string, stderr string, exitCode int, err error) { + localCommand := t.prepareLocalCommand(cmd, true) + return runLocalCommandWithInputWithTimeout(localCommand, "", 0, false) +} + +// RunCommandEx executes a command on the remote target using SSH with additional options. +func (t *RemoteTarget) RunCommandEx(cmd *exec.Cmd, timeout int, newProcessGroup bool, reuseSSHConnection bool) (stdout string, stderr string, exitCode int, err error) { localCommand := t.prepareLocalCommand(cmd, reuseSSHConnection) - return runLocalCommandWithInputWithTimeout(localCommand, "", timeout) + return runLocalCommandWithInputWithTimeout(localCommand, "", timeout, newProcessGroup) } // RunCommandStream executes a command asynchronously on a remote target. @@ -48,8 +41,6 @@ func (t *RemoteTarget) RunCommand(cmd *exec.Cmd, timeout int, reuseSSHConnection // // Parameters: // - cmd: The command to be executed, represented as an *exec.Cmd. -// - timeout: The maximum duration (in seconds) to allow the command to run. -// - reuseSSHConnection: A boolean indicating whether to reuse an existing SSH connection. // - stdoutChannel: A channel to send the standard output of the command. // - stderrChannel: A channel to send the standard error of the command. // - exitcodeChannel: A channel to send the exit code of the command. @@ -57,10 +48,10 @@ func (t *RemoteTarget) RunCommand(cmd *exec.Cmd, timeout int, reuseSSHConnection // // Returns: // - err: An error object if the command fails to execute or times out. -func (t *RemoteTarget) RunCommandStream(cmd *exec.Cmd, timeout int, reuseSSHConnection bool, stdoutChannel chan []byte, stderrChannel chan []byte, exitcodeChannel chan int, cmdChannel chan *exec.Cmd) (err error) { - localCommand := t.prepareLocalCommand(cmd, reuseSSHConnection) +func (t *RemoteTarget) RunCommandStream(cmd *exec.Cmd, stdoutChannel chan []byte, stderrChannel chan []byte, exitcodeChannel chan int, cmdChannel chan *exec.Cmd) (err error) { + localCommand := t.prepareLocalCommand(cmd, false) cmdChannel <- localCommand - err = runLocalCommandWithInputWithTimeoutAsync(localCommand, stdoutChannel, stderrChannel, exitcodeChannel, "", timeout) + err = runLocalCommandWithInputWithTimeoutAsync(localCommand, stdoutChannel, stderrChannel, exitcodeChannel, "", 0) return } @@ -96,7 +87,7 @@ func (t *RemoteTarget) CreateTempDirectory(rootDir string) (tempDir string, err root = fmt.Sprintf("--tmpdir=%s", rootDir) } cmd := exec.Command("mktemp", "-d", "-t", root, "perfspect.tmp.XXXXXXXXXX", "|", "xargs", "realpath") // #nosec G204 - tempDir, _, _, err = t.RunCommand(cmd, 0, true) + tempDir, _, _, err = t.RunCommand(cmd) if err != nil { return } @@ -158,14 +149,14 @@ func (t *RemoteTarget) PullFile(srcPath string, dstDir string) error { func (t *RemoteTarget) CreateDirectory(baseDir string, targetDir string) (dir string, err error) { dir = filepath.Join(baseDir, targetDir) cmd := exec.Command("mkdir", dir) - _, _, _, err = t.RunCommand(cmd, 0, true) + _, _, _, err = t.RunCommand(cmd) return } func (t *RemoteTarget) RemoveDirectory(targetDir string) (err error) { if targetDir != "" { cmd := exec.Command("rm", "-rf", targetDir) - _, _, _, err = t.RunCommand(cmd, 0, true) + _, _, _, err = t.RunCommand(cmd) } return } @@ -173,7 +164,7 @@ func (t *RemoteTarget) RemoveDirectory(targetDir string) (err error) { // CanConnect checks if the target is reachable. func (t *RemoteTarget) CanConnect() bool { cmd := exec.Command("exit", "0") - _, _, _, err := t.RunCommand(cmd, 5, true) + _, _, _, err := t.RunCommand(cmd) return err == nil } @@ -188,7 +179,7 @@ func (t *RemoteTarget) CanElevatePrivileges() bool { return true } cmd := exec.Command("sudo", "-kS", "ls") - _, _, _, err := t.RunCommand(cmd, 0, true) + _, _, _, err := t.RunCommand(cmd) if err == nil { // true - passwordless sudo works t.canElevate = 1 return true @@ -219,7 +210,7 @@ func (t *RemoteTarget) GetName() (host string) { func (t *RemoteTarget) GetUserPath() (string, error) { if t.userPath == "" { cmd := exec.Command("echo", "$PATH") - stdout, _, _, err := t.RunCommand(cmd, 0, true) + stdout, _, _, err := t.RunCommand(cmd) if err != nil { return "", err } @@ -449,6 +440,6 @@ func (t *RemoteTarget) prepareAndRunSCPCommand(srcPath string, dstDir string, is if usePass { localCommand.Env = append(localCommand.Env, "SSHPASS="+t.sshPass) } - stdout, stderr, exitCode, err = runLocalCommandWithInputWithTimeout(localCommand, "", 0) + stdout, stderr, exitCode, err = runLocalCommandWithInputWithTimeout(localCommand, "", 0, false) return } diff --git a/internal/target/target.go b/internal/target/target.go index 1b7883f5..6533436a 100644 --- a/internal/target/target.go +++ b/internal/target/target.go @@ -65,22 +65,30 @@ type Target interface { // RunCommand runs the specified command on the target. // Arguments: // - cmd: the command to run + // It returns the standard output, standard error, exit code, and any error that occurred. + RunCommand(cmd *exec.Cmd) (stdout string, stderr string, exitCode int, err error) + + // RunCommandEx runs the specified command on the target with additional options. + // Arguments: + // - cmd: the command to run // - timeout: the maximum time allowed for the command to run (zero means no timeout) - // - reuseSSHConnection: whether to reuse the SSH connection for the command (only relevant for RemoteTarget) + // - newProcessGroup: whether to run the command in a new process group. When true, the + // command is isolated from terminal and parent-process signals (for example, Ctrl-C), + // which is useful for long-running or background workloads that should not be + // interrupted when the caller's terminal session receives a signal. + // - remoteReuseSSHConnection: whether to reuse the SSH connection for the command (only relevant for RemoteTarget) // It returns the standard output, standard error, exit code, and any error that occurred. - RunCommand(cmd *exec.Cmd, timeout int, reuseSSHConnection bool) (stdout string, stderr string, exitCode int, err error) + RunCommandEx(cmd *exec.Cmd, timeout int, newProcessGroup bool, remoteReuseSSHConnection bool) (stdout string, stderr string, exitCode int, err error) // RunCommandStream runs the specified command on the target and streams the output to the provided channels. // Arguments: // - cmd: the command to run - // - timeout: the maximum time allowed for the command to run (zero means no timeout) - // - reuseSSHConnection: whether to reuse the SSH connection for the command (only relevant for RemoteTarget) // - stdoutChannel: a channel to send the standard output of the command // - stderrChannel: a channel to send the standard error of the command // - exitcodeChannel: a channel to send the exit code of the command // - cmdChannel: a channel to send the command that was run // It returns any error that occurred. - RunCommandStream(cmd *exec.Cmd, timeout int, reuseSSHConnection bool, stdoutChannel chan []byte, stderrChannel chan []byte, exitcodeChannel chan int, cmdChannel chan *exec.Cmd) error + RunCommandStream(cmd *exec.Cmd, stdoutChannel chan []byte, stderrChannel chan []byte, exitcodeChannel chan int, cmdChannel chan *exec.Cmd) error // PushFile transfers a file from the local system to the target. // It returns any error that occurred.