From 4a2c1398253dfd08cb167d9b3b27f4aaa37d61ca Mon Sep 17 00:00:00 2001 From: Gyan Ranjan Panda Date: Mon, 23 Feb 2026 14:41:09 +0530 Subject: [PATCH] SPDX: Replace deprecated documentDescribes with DESCRIBES relationships MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While working on SPDX output, I noticed we're still using the documentDescribes field even though it was deprecated in SPDX 2.3 in favour of explicit DESCRIBES relationships in the relationships array. This removes that field from serialization and instead writes equivalent DESCRIBES relationships, prepended so document-level entries appear first. Reading is still backward compatible — if an older document uses documentDescribes, it's transparently converted when parsed. Updated the asgiref fixture and added two new tests: one that confirms the deprecated field no longer appears in output, and one that verifies older documents round-trip correctly to the new format. Signed-off-by: Gyan Ranjan Panda --- scanpipe/pipes/spdx.py | 56 +++++++++++++---- .../data/asgiref/asgiref-3.3.0.spdx.json | 8 ++- scanpipe/tests/pipes/test_spdx.py | 62 ++++++++++++++++++- 3 files changed, 110 insertions(+), 16 deletions(-) diff --git a/scanpipe/pipes/spdx.py b/scanpipe/pipes/spdx.py index 5bb3e6bb16..23b3e6041e 100644 --- a/scanpipe/pipes/spdx.py +++ b/scanpipe/pipes/spdx.py @@ -607,7 +607,6 @@ def as_dict(self): "SPDXID": self.spdx_id, "name": self.safe_document_name(self.name), "documentNamespace": self.namespace, - "documentDescribes": self.describes, "creationInfo": self.creation_info.as_dict(), "packages": [package.as_dict(self.version) for package in self.packages], } @@ -620,10 +619,24 @@ def as_dict(self): license_info.as_dict() for license_info in self.extracted_licenses ] - if self.relationships: - data["relationships"] = [ - relationship.as_dict() for relationship in self.relationships - ] + # The SPDX 2.3 spec deprecated the top-level `documentDescribes` field. 
+ # Instead, emit explicit DESCRIBES relationships prepended to maintain + # deterministic ordering and ensure document-level relationships appear first. + # See https://github.com/spdx/spdx-spec/issues/395 + describes_relationships = [ + { + "spdxElementId": self.spdx_id, + "relatedSpdxElement": spdx_id, + "relationshipType": "DESCRIBES", + } + for spdx_id in (self.describes or []) + ] + existing_relationships = [ + relationship.as_dict() for relationship in (self.relationships or []) + ] + all_relationships = describes_relationships + existing_relationships + if all_relationships: + data["relationships"] = all_relationships if self.comment: data["comment"] = self.comment @@ -636,13 +649,37 @@ def as_json(self, indent=2): @classmethod def from_data(cls, data): + spdx_id = data.get("SPDXID") + relationships_data = data.get("relationships", []) + + # Backward compatibility: reconstruct `describes` from the legacy + # `documentDescribes` field if present, or derive it from DESCRIBES + # relationships where spdxElementId matches the document SPDXID. + describes = data.get("documentDescribes") or [ + r["relatedSpdxElement"] + for r in relationships_data + if r.get("relationshipType") == "DESCRIBES" + and r.get("spdxElementId") == spdx_id + ] + + # Drop every document-level DESCRIBES relationship: as_dict() regenerates + # them from `describes` alone, so keeping them would duplicate the entries. 
+ other_relationships = [ + Relationship.from_data(r) + for r in relationships_data + if not ( + r.get("relationshipType") == "DESCRIBES" + and r.get("spdxElementId") == spdx_id + ) + ] + return cls( - spdx_id=data.get("SPDXID"), + spdx_id=spdx_id, version=data.get("spdxVersion", "").split("SPDX-")[-1], data_license=data.get("dataLicense"), name=data.get("name"), namespace=data.get("documentNamespace"), - describes=data.get("documentDescribes"), + describes=describes, creation_info=CreationInfo.from_data(data.get("creationInfo", {})), packages=[ Package.from_data(package_data) @@ -653,10 +690,7 @@ def from_data(cls, data): ExtractedLicensingInfo.from_data(license_info_data) for license_info_data in data.get("hasExtractedLicensingInfos", []) ], - relationships=[ - Relationship.from_data(relationship_data) - for relationship_data in data.get("relationships", []) - ], + relationships=other_relationships, comment=data.get("comment"), ) diff --git a/scanpipe/tests/data/asgiref/asgiref-3.3.0.spdx.json b/scanpipe/tests/data/asgiref/asgiref-3.3.0.spdx.json index 565e2f4506..28ee50be87 100644 --- a/scanpipe/tests/data/asgiref/asgiref-3.3.0.spdx.json +++ b/scanpipe/tests/data/asgiref/asgiref-3.3.0.spdx.json @@ -4,9 +4,6 @@ "SPDXID": "SPDXRef-DOCUMENT-92fe63d9-1d53-4b63-b19a-85022fb7a3f3", "name": "scancodeio_asgiref", "documentNamespace": "https://scancode.io/spdxdocs/92fe63d9-1d53-4b63-b19a-85022fb7a3f3", - "documentDescribes": [ - "SPDXRef-scancodeio-project-92fe63d9-1d53-4b63-b19a-85022fb7a3f3" - ], "creationInfo": { "created": "2000-01-01T01:02:03Z", "creators": [ @@ -131,6 +128,11 @@ ], "files": [], "relationships": [ + { + "spdxElementId": "SPDXRef-DOCUMENT-92fe63d9-1d53-4b63-b19a-85022fb7a3f3", + "relatedSpdxElement": "SPDXRef-scancodeio-project-92fe63d9-1d53-4b63-b19a-85022fb7a3f3", + "relationshipType": "DESCRIBES" + }, { "spdxElementId": "SPDXRef-scancodeio-project-92fe63d9-1d53-4b63-b19a-85022fb7a3f3", "relatedSpdxElement": 
"SPDXRef-scancodeio-discoveredpackage-543a3583-3a13-4b5d-a039-c6bc4072de35", diff --git a/scanpipe/tests/pipes/test_spdx.py b/scanpipe/tests/pipes/test_spdx.py index befa12913a..0d01b70a63 100644 --- a/scanpipe/tests/pipes/test_spdx.py +++ b/scanpipe/tests/pipes/test_spdx.py @@ -196,7 +196,6 @@ def setUp(self): "SPDXID": "SPDXRef-DOCUMENT", "name": "document_name", "documentNamespace": "https://[CreatorWebsite]/[DocumentName]-[UUID]", - "documentDescribes": ["SPDXRef-project"], "creationInfo": { "created": "2022-09-21T13:50:20Z", "creators": [ @@ -272,11 +271,16 @@ def setUp(self): } ], "relationships": [ + { + "spdxElementId": "SPDXRef-DOCUMENT", + "relatedSpdxElement": "SPDXRef-project", + "relationshipType": "DESCRIBES", + }, { "spdxElementId": "SPDXRef-package1", "relatedSpdxElement": "SPDXRef-file1", "relationshipType": "CONTAINS", - } + }, ], "comment": "This document was created using SPDXCode-1.0", } @@ -412,3 +416,57 @@ def test_spdx_validate_document(self): with self.assertRaises(Exception): spdx.validate_document({}, self.schema_2_3) + + def test_spdx_document_describes_uses_relationship(self): + """documentDescribes is removed; equivalent DESCRIBES relationships are emitted.""" + document = spdx.Document(**self.document_data) + result = document.as_dict() + + assert "documentDescribes" not in result + + describes_rels = [ + r + for r in result.get("relationships", []) + if r.get("relationshipType") == "DESCRIBES" + ] + assert len(describes_rels) == 1 + assert describes_rels[0]["spdxElementId"] == "SPDXRef-DOCUMENT" + assert describes_rels[0]["relatedSpdxElement"] == "SPDXRef-project" + + def test_spdx_document_from_data_backward_compat(self): + """Legacy documentDescribes input round-trips correctly to DESCRIBES relationships.""" + legacy_data = { + "spdxVersion": "SPDX-2.3", + "dataLicense": "CC0-1.0", + "SPDXID": "SPDXRef-DOCUMENT", + "name": "legacy_doc", + "documentNamespace": "https://example.com/legacy", + "documentDescribes": ["SPDXRef-root"], + 
"creationInfo": { + "created": "2022-01-01T00:00:00Z", + "creators": ["Tool: OldTool-1.0"], + }, + "packages": [ + { + "SPDXID": "SPDXRef-root", + "name": "root-pkg", + "downloadLocation": "NOASSERTION", + "filesAnalyzed": False, + } + ], + } + document = spdx.Document.from_data(legacy_data) + + # Internal describes is reconstructed correctly + assert document.describes == ["SPDXRef-root"] + + # Re-serialized output uses relationships, not the legacy field + result = document.as_dict() + assert "documentDescribes" not in result + describes_rels = [ + r + for r in result.get("relationships", []) + if r.get("relationshipType") == "DESCRIBES" + ] + assert len(describes_rels) == 1 + assert describes_rels[0]["relatedSpdxElement"] == "SPDXRef-root"