aboutcode-org · uttam282005 · Mar 2, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/scanpipe/api/views.py b/scanpipe/api/views.py
@@ -178,21 +178,9 @@ def results_download(self, request, *args, **kwargs):
 
         if format == "json":
             return project_results_json_response(project, as_attachment=True)
-        elif format == "xlsx":
-            output_file = output.to_xlsx(project)
-        elif format == "spdx":
-            output_file = output.to_spdx(project, **output_kwargs)
-        elif format == "cyclonedx":
-            output_file = output.to_cyclonedx(project, **output_kwargs)
-        elif format == "attribution":
-            output_file = output.to_attribution(project)
-        elif format == "ort-package-list":
-            output_file = output.to_ort_package_list_yml(project)
-        elif format == "all_formats":
-            output_file = output.to_all_formats(project)
-        elif format == "all_outputs":
-            output_file = output.to_all_outputs(project)
-        else:
+        try:
+            output_file = output.get_output_for_format(project, format, **output_kwargs)
+        except ValueError:
             return ErrorResponse(f"Format {format} not supported.")
 
         filename = output.safe_filename(f"scancodeio_{project.name}_{output_file.name}")

diff --git a/scanpipe/pipes/output.py b/scanpipe/pipes/output.py
@@ -177,6 +177,22 @@ def to_csv(project):
     return output_files
 
 
+def strip_symbols(data):
+    """
+    Remove symbol-related keys from general project output.
+
+    Returns a new dict with symbols removed, leaving the input unchanged.
+    """
+    if not isinstance(data, dict) or not data:
+        return data
+
+    data_copy = dict(data)
+    for key in ("source_symbols", "source_strings", "source_comments"):
+        data_copy.pop(key, None)
+
+    return data_copy
+
+
 class JSONResultsGenerator:
     """
     Return the `project` JSON results as a Python generator.
@@ -236,6 +252,12 @@ def get_headers(self, project):
 
         messages = project.projectmessages.all()
         messages = ProjectMessageSerializer(messages, many=True)
+        sanitized_messages = []
+        for msg in messages.data:
+            msg_copy = msg
+            if "details" in msg_copy:
+                msg_copy["details"] = strip_symbols(msg_copy.get("details"))
+            sanitized_messages.append(msg_copy)
 
         other_tools = [f"pkg:pypi/scancode-toolkit@{scancode_toolkit_version}"]
 
@@ -250,15 +272,18 @@ def get_headers(self, project):
             "settings": project.settings,
             "input_sources": project.get_inputs_with_source(),
             "runs": runs.data,
-            "messages": messages.data,
+            "messages": sanitized_messages,
             "extra_data": project.extra_data,
         }
         yield self.encode(headers)
 
     def encode_queryset(self, project, model_name, serializer):
         queryset = get_queryset(project, model_name)
         for obj in queryset.iterator(chunk_size=2000):
-            yield self.encode(serializer(obj).data)
+            data = serializer(obj).data
+            if model_name == "codebaseresource" and "extra_data" in data:
+                data["extra_data"] = strip_symbols(data.get("extra_data"))
+            yield self.encode(data)
 
     def get_packages(self, project):
         from scanpipe.api.serializers import DiscoveredPackageSerializer
@@ -289,7 +314,41 @@ def get_relations(self, project):
         )
 
 
-def to_json(project):
+class SymbolsJSONResultsGenerator(JSONResultsGenerator):
+    """
+    Specialized JSON generator that only outputs resources with symbol data.
+    Inherits header generation from parent, but filters resources.
+    """
+
+    def get_resources(self, project):
+        from scanpipe.api.serializers import CodebaseResourceSerializer
+
+        queryset = project.codebaseresources.all()
+        for resource in queryset.iterator(chunk_size=2000):
+            extra_data = resource.extra_data or {}
+            has_symbols = any(
+                key in extra_data
+                for key in ("source_symbols", "source_strings", "source_comments")
+            )
+            if has_symbols:
+                yield self.encode(CodebaseResourceSerializer(resource).data)
+
+    def __iter__(self):
+        yield '{"headers":['
+        for chunk in self.get_headers(self.project):
+            yield chunk
+        yield '],"files":['
+
+        first = True
+        for chunk in self.get_resources(self.project):
+            if not first:
+                yield ","
+            first = False
+            yield chunk
+        yield "]}"
+
+
+def to_json(project, **kwargs):
     """
     Generate output for the provided `project` in JSON format.
     The output file is created in the `project` output/ directory.
@@ -387,6 +446,9 @@ def add_xlsx_worksheet(workbook, worksheet_name, rows, fields):
     for row_index, record in enumerate(rows, start=1):
         row_errors = []
         record_is_dict = isinstance(record, dict)
+        record_is_projectmessage = not isinstance(record, dict) and isinstance(
+            record, ProjectMessage
+        )
         for col_index, field in enumerate(fields):
             if record_is_dict:
                 value = record.get(field)
@@ -396,6 +458,8 @@ def add_xlsx_worksheet(workbook, worksheet_name, rows, fields):
             if not value:
                 continue
 
+            if record_is_projectmessage and field == "details":
+                value = strip_symbols(value)
             value, error = _adapt_value_for_xlsx(field, value)
 
             if error:
@@ -532,7 +596,7 @@ def _adapt_value_for_xlsx(fieldname, value, maximum_length=32767, _adapt=True):
 ]
 
 
-def to_xlsx(project):
+def to_xlsx(project, **kwargs):
     """
     Generate output for the provided ``project`` in XLSX format.
     The output file is created in the ``project`` "output/" directory.
@@ -1079,7 +1143,7 @@ def get_unique_licenses(packages):
     return licenses
 
 
-def to_attribution(project):
+def to_attribution(project, **kwargs):
     """
     Generate attribution for the provided ``project``.
     The output file is created in the ``project`` "output/" directory.
@@ -1120,7 +1184,7 @@ def to_attribution(project):
     return output_file
 
 
-def to_ort_package_list_yml(project):
+def to_ort_package_list_yml(project, **kwargs):
     """
     Generate an ORT compatible "package-list.yml" output.
     The output file is created in the ``project`` "output/" directory.
@@ -1132,13 +1196,34 @@ def to_ort_package_list_yml(project):
     return output_file
 
 
+def to_symbols_json(project, **kwargs):
+    """
+    Generate a symbols-only JSON output for the provided `project`.
+
+    This output contains only the symbols-related extracted data stored on
+    CodebaseResource.extra_data:
+      - source_symbols
+      - source_strings
+      - source_comments
+    """
+    results_generator = SymbolsJSONResultsGenerator(project)
+    output_file = project.get_output_file_path("symbols", "json")
+
+    with output_file.open("w") as file:
+        for chunk in results_generator:
+            file.write(chunk)
+
+    return output_file
+
+
 FORMAT_TO_FUNCTION_MAPPING = {
     "json": to_json,
     "xlsx": to_xlsx,
     "spdx": to_spdx,
     "cyclonedx": to_cyclonedx,
     "attribution": to_attribution,
     "ort-package-list": to_ort_package_list_yml,
+    "symbols": to_symbols_json,
 }
 
 
@@ -1180,3 +1265,24 @@ def to_all_outputs(project):
     zip_file.name = safe_filename(f"{project.name}_outputs.zip")
 
     return zip_file
+
+
+def get_output_for_format(project, format, version=None):
+    """
+    Generate output file for the given format.
+    Returns the output file object.
+    """
+    output_kwargs = {}
+    if version:
+        output_kwargs["version"] = version
+
+    if format == "all_formats":
+        return to_all_formats(project)
+    if format == "all_outputs":
+        return to_all_outputs(project)
+
+    output_function = FORMAT_TO_FUNCTION_MAPPING.get(format)
+    if output_function:
+        return output_function(project, **output_kwargs)
+
+    raise ValueError(f"Unsupported output format: {format}")