Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 3 additions & 15 deletions scanpipe/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,21 +178,9 @@ def results_download(self, request, *args, **kwargs):

if format == "json":
return project_results_json_response(project, as_attachment=True)
elif format == "xlsx":
output_file = output.to_xlsx(project)
elif format == "spdx":
output_file = output.to_spdx(project, **output_kwargs)
elif format == "cyclonedx":
output_file = output.to_cyclonedx(project, **output_kwargs)
elif format == "attribution":
output_file = output.to_attribution(project)
elif format == "ort-package-list":
output_file = output.to_ort_package_list_yml(project)
elif format == "all_formats":
output_file = output.to_all_formats(project)
elif format == "all_outputs":
output_file = output.to_all_outputs(project)
else:
try:
output_file = output.get_output_for_format(project, format, **output_kwargs)
except ValueError:
return ErrorResponse(f"Format {format} not supported.")

filename = output.safe_filename(f"scancodeio_{project.name}_{output_file.name}")
Expand Down
118 changes: 112 additions & 6 deletions scanpipe/pipes/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,22 @@ def to_csv(project):
return output_files


def strip_symbols(data):
"""
Remove symbol-related keys from general project output.

Returns a new dict with symbols removed, leaving the input unchanged.
"""
if not isinstance(data, dict) or not data:
return data

data_copy = dict(data)
for key in ("source_symbols", "source_strings", "source_comments"):
data_copy.pop(key, None)

return data_copy


class JSONResultsGenerator:
"""
Return the `project` JSON results as a Python generator.
Expand Down Expand Up @@ -236,6 +252,12 @@ def get_headers(self, project):

messages = project.projectmessages.all()
messages = ProjectMessageSerializer(messages, many=True)
sanitized_messages = []
for msg in messages.data:
msg_copy = msg
if "details" in msg_copy:
msg_copy["details"] = strip_symbols(msg_copy.get("details"))
sanitized_messages.append(msg_copy)

other_tools = [f"pkg:pypi/scancode-toolkit@{scancode_toolkit_version}"]

Expand All @@ -250,15 +272,18 @@ def get_headers(self, project):
"settings": project.settings,
"input_sources": project.get_inputs_with_source(),
"runs": runs.data,
"messages": messages.data,
"messages": sanitized_messages,
"extra_data": project.extra_data,
}
yield self.encode(headers)

def encode_queryset(self, project, model_name, serializer):
queryset = get_queryset(project, model_name)
for obj in queryset.iterator(chunk_size=2000):
yield self.encode(serializer(obj).data)
data = serializer(obj).data
if model_name == "codebaseresource" and "extra_data" in data:
data["extra_data"] = strip_symbols(data.get("extra_data"))
yield self.encode(data)

def get_packages(self, project):
from scanpipe.api.serializers import DiscoveredPackageSerializer
Expand Down Expand Up @@ -289,7 +314,41 @@ def get_relations(self, project):
)


def to_json(project):
class SymbolsJSONResultsGenerator(JSONResultsGenerator):
"""
Specialized JSON generator that only outputs resources with symbol data.
Inherits header generation from parent, but filters resources.
"""

def get_resources(self, project):
from scanpipe.api.serializers import CodebaseResourceSerializer

queryset = project.codebaseresources.all()
for resource in queryset.iterator(chunk_size=2000):
extra_data = resource.extra_data or {}
has_symbols = any(
key in extra_data
for key in ("source_symbols", "source_strings", "source_comments")
)
if has_symbols:
yield self.encode(CodebaseResourceSerializer(resource).data)

def __iter__(self):
yield '{"headers":['
for chunk in self.get_headers(self.project):
yield chunk
yield '],"files":['

first = True
for chunk in self.get_resources(self.project):
if not first:
yield ","
first = False
yield chunk
yield "]}"


def to_json(project, **kwargs):
"""
Generate output for the provided `project` in JSON format.
The output file is created in the `project` output/ directory.
Expand Down Expand Up @@ -387,6 +446,9 @@ def add_xlsx_worksheet(workbook, worksheet_name, rows, fields):
for row_index, record in enumerate(rows, start=1):
row_errors = []
record_is_dict = isinstance(record, dict)
record_is_projectmessage = not isinstance(record, dict) and isinstance(
record, ProjectMessage
)
for col_index, field in enumerate(fields):
if record_is_dict:
value = record.get(field)
Expand All @@ -396,6 +458,8 @@ def add_xlsx_worksheet(workbook, worksheet_name, rows, fields):
if not value:
continue

if record_is_projectmessage and field == "details":
value = strip_symbols(value)
value, error = _adapt_value_for_xlsx(field, value)

if error:
Expand Down Expand Up @@ -532,7 +596,7 @@ def _adapt_value_for_xlsx(fieldname, value, maximum_length=32767, _adapt=True):
]


def to_xlsx(project):
def to_xlsx(project, **kwargs):
"""
Generate output for the provided ``project`` in XLSX format.
The output file is created in the ``project`` "output/" directory.
Expand Down Expand Up @@ -1079,7 +1143,7 @@ def get_unique_licenses(packages):
return licenses


def to_attribution(project):
def to_attribution(project, **kwargs):
"""
Generate attribution for the provided ``project``.
The output file is created in the ``project`` "output/" directory.
Expand Down Expand Up @@ -1120,7 +1184,7 @@ def to_attribution(project):
return output_file


def to_ort_package_list_yml(project):
def to_ort_package_list_yml(project, **kwargs):
"""
Generate an ORT compatible "package-list.yml" output.
The output file is created in the ``project`` "output/" directory.
Expand All @@ -1132,13 +1196,34 @@ def to_ort_package_list_yml(project):
return output_file


def to_symbols_json(project, **kwargs):
"""
Generate a symbols-only JSON output for the provided `project`.

This output contains only the symbols-related extracted data stored on
CodebaseResource.extra_data:
- source_symbols
- source_strings
- source_comments
"""
results_generator = SymbolsJSONResultsGenerator(project)
output_file = project.get_output_file_path("symbols", "json")

with output_file.open("w") as file:
for chunk in results_generator:
file.write(chunk)

return output_file


FORMAT_TO_FUNCTION_MAPPING = {
"json": to_json,
"xlsx": to_xlsx,
"spdx": to_spdx,
"cyclonedx": to_cyclonedx,
"attribution": to_attribution,
"ort-package-list": to_ort_package_list_yml,
"symbols": to_symbols_json,
}


Expand Down Expand Up @@ -1180,3 +1265,24 @@ def to_all_outputs(project):
zip_file.name = safe_filename(f"{project.name}_outputs.zip")

return zip_file


def get_output_for_format(project, format, version=None):
"""
Generate output file for the given format.
Returns the output file object.
"""
output_kwargs = {}
if version:
output_kwargs["version"] = version

if format == "all_formats":
return to_all_formats(project)
if format == "all_outputs":
return to_all_outputs(project)

output_function = FORMAT_TO_FUNCTION_MAPPING.get(format)
if output_function:
return output_function(project, **output_kwargs)

raise ValueError(f"Unsupported output format: {format}")
Loading
Loading