Browse Source

feat: dedup sbom but keep it monolithic (#6004)

Co-authored-by: Gergely Brautigam <182850+Skarlso@users.noreply.github.com>
Co-authored-by: Gergely Bräutigam <gergely.brautigam@sap.com>
Moritz Johner 1 month ago
parent
commit
d2220b82ad
2 changed files with 179 additions and 2 deletions
  1. 58 2
      .github/actions/sign/action.yml
  2. 121 0
      hack/dedupe-spdx-gomod.sh

+ 58 - 2
.github/actions/sign/action.yml

@@ -87,10 +87,38 @@ runs:
         # Image SBOM (OS + application libs contained in the image)
         echo "Generating image SBOM for ${IMAGE_NAME}@${CONTAINER_DIGEST}"
         syft "${IMAGE_NAME}@${CONTAINER_DIGEST}" -o spdx-json=sbom.${IMAGE_TAG}.spdx.json
+        # Log the raw syft output size so the effect of deduplication is visible.
+        ORIGINAL_IMAGE_SBOM_SIZE="$(wc -c < "sbom.${IMAGE_TAG}.spdx.json")"
+        echo "Original image SBOM size: ${ORIGINAL_IMAGE_SBOM_SIZE} bytes"
+
+        # Upper bound (bytes) for the predicate handed to cosign attest; anything
+        # larger is refused below to avoid Rekor submission retries/failure.
+        MAX_SBOM_SIZE_BYTES=10000000
+
+        echo "Deduplicating image SPDX package nodes and relationships"
+        bash ./hack/dedupe-spdx-gomod.sh \
+          --input "sbom.${IMAGE_TAG}.spdx.json" \
+          --output "sbom.${IMAGE_TAG}.dedup.spdx.json"
+
+        DEDUP_IMAGE_SBOM_SIZE="$(wc -c < "sbom.${IMAGE_TAG}.dedup.spdx.json")"
+        echo "Deduplicated image SBOM size: ${DEDUP_IMAGE_SBOM_SIZE} bytes"
+
+        # Second pass: re-run from the original SBOM with file ownership data
+        # dropped, overwriting the first deduplicated output.
+        if [[ "${DEDUP_IMAGE_SBOM_SIZE}" -gt "${MAX_SBOM_SIZE_BYTES}" ]]; then
+          echo "Deduped image SBOM still above ${MAX_SBOM_SIZE_BYTES} bytes, dropping file ownership data"
+          bash ./hack/dedupe-spdx-gomod.sh \
+            --input "sbom.${IMAGE_TAG}.spdx.json" \
+            --output "sbom.${IMAGE_TAG}.dedup.spdx.json" \
+            --drop-file-ownership
+          DEDUP_IMAGE_SBOM_SIZE="$(wc -c < "sbom.${IMAGE_TAG}.dedup.spdx.json")"
+          echo "Ownership-pruned deduplicated image SBOM size: ${DEDUP_IMAGE_SBOM_SIZE} bytes"
+        fi
+
+        # Still too large after pruning: fail the step instead of attesting.
+        if [[ "${DEDUP_IMAGE_SBOM_SIZE}" -gt "${MAX_SBOM_SIZE_BYTES}" ]]; then
+          echo "Image SBOM predicate is still too large (${DEDUP_IMAGE_SBOM_SIZE} bytes)."
+          echo "Refusing attestation to avoid Rekor submission retries/failure."
+          exit 1
+        fi
         echo "::endgroup::"
 
         echo "::group::Attest image SBOM"
-        cosign attest --yes --new-bundle-format=false --use-signing-config=false --predicate sbom.${IMAGE_TAG}.spdx.json --type spdx "${IMAGE_NAME}@${CONTAINER_DIGEST}"
+        cosign attest --yes --new-bundle-format=false --use-signing-config=false --predicate sbom.${IMAGE_TAG}.dedup.spdx.json --type spdx "${IMAGE_NAME}@${CONTAINER_DIGEST}"
         echo "::endgroup::"
 
         echo "::group::Verify image SBOM attestation"
@@ -104,10 +132,38 @@ runs:
         # Go modules SBOM (dependencies from the source tree)
         # Requires repository to be checked out before this composite action runs.
         syft dir:. -o spdx-json=sbom.gomod.${IMAGE_TAG}.spdx.json
+        # Log the raw syft output size so the effect of deduplication is visible.
+        ORIGINAL_GOMOD_SBOM_SIZE="$(wc -c < "sbom.gomod.${IMAGE_TAG}.spdx.json")"
+        echo "Original Go modules SBOM size: ${ORIGINAL_GOMOD_SBOM_SIZE} bytes"
+
+        # Keep an already-set limit, otherwise fall back to the value used for
+        # the image SBOM. An empty limit would be evaluated as 0 by [[ -gt ]]
+        # and every SBOM would be rejected.
+        MAX_SBOM_SIZE_BYTES="${MAX_SBOM_SIZE_BYTES:-10000000}"
+
+        echo "Deduplicating Go modules SPDX package nodes and relationships"
+        bash ./hack/dedupe-spdx-gomod.sh \
+          --input "sbom.gomod.${IMAGE_TAG}.spdx.json" \
+          --output "sbom.gomod.${IMAGE_TAG}.dedup.spdx.json"
+
+        DEDUP_GOMOD_SBOM_SIZE="$(wc -c < "sbom.gomod.${IMAGE_TAG}.dedup.spdx.json")"
+        echo "Deduplicated Go modules SBOM size: ${DEDUP_GOMOD_SBOM_SIZE} bytes"
+
+        # Rekor requests can fail when predicates are too large. If the deduped
+        # SBOM is still big, drop file ownership-heavy data and re-check size.
+        if [[ "${DEDUP_GOMOD_SBOM_SIZE}" -gt "${MAX_SBOM_SIZE_BYTES}" ]]; then
+          echo "Deduped SBOM still above ${MAX_SBOM_SIZE_BYTES} bytes, dropping file ownership data"
+          bash ./hack/dedupe-spdx-gomod.sh \
+            --input "sbom.gomod.${IMAGE_TAG}.spdx.json" \
+            --output "sbom.gomod.${IMAGE_TAG}.dedup.spdx.json" \
+            --drop-file-ownership
+          DEDUP_GOMOD_SBOM_SIZE="$(wc -c < "sbom.gomod.${IMAGE_TAG}.dedup.spdx.json")"
+          echo "Ownership-pruned deduplicated Go modules SBOM size: ${DEDUP_GOMOD_SBOM_SIZE} bytes"
+        fi
+
+        # Still too large after pruning: fail the step instead of attesting.
+        if [[ "${DEDUP_GOMOD_SBOM_SIZE}" -gt "${MAX_SBOM_SIZE_BYTES}" ]]; then
+          echo "Go modules SBOM predicate is still too large (${DEDUP_GOMOD_SBOM_SIZE} bytes)."
+          echo "Refusing attestation to avoid Rekor submission retries/failure."
+          exit 1
+        fi
         echo "::endgroup::"
 
         echo "::group::Attest Go modules SBOM"
-        cosign attest --yes --new-bundle-format=false --use-signing-config=false --predicate sbom.gomod.${IMAGE_TAG}.spdx.json --type spdx "${IMAGE_NAME}@${CONTAINER_DIGEST}"
+        cosign attest --yes --new-bundle-format=false --use-signing-config=false --predicate sbom.gomod.${IMAGE_TAG}.dedup.spdx.json --type spdx "${IMAGE_NAME}@${CONTAINER_DIGEST}"
         echo "::endgroup::"
 
         echo "::group::Verify Go modules SBOM attestation"

+ 121 - 0
hack/dedupe-spdx-gomod.sh

@@ -0,0 +1,121 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat <<'EOF'
+Usage:
+  dedupe-spdx-gomod.sh --input INPUT_SPDX_JSON --output OUTPUT_SPDX_JSON [--drop-file-ownership]
+
+Description:
+  Deduplicates SPDX package nodes by purl (fallback: name@version), rewrites
+  relationships to canonical SPDX IDs, and deduplicates relationships.
+
+  Optional flag --drop-file-ownership removes file ownership-heavy data:
+  - drops relationshipType OTHER
+  - removes files[] entries
+EOF
+}
+
+INPUT=""
+OUTPUT=""
+DROP_FILE_OWNERSHIP="false"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --input)
+      if [[ -z "${2:-}" || "${2:-}" == --* || "${2:-}" == -* ]]; then
+        echo "Error: --input requires a value" >&2
+        usage >&2
+        exit 1
+      fi
+      INPUT="${2:-}"
+      shift 2
+      ;;
+    --output)
+      if [[ -z "${2:-}" || "${2:-}" == --* || "${2:-}" == -* ]]; then
+        echo "Error: --output requires a value" >&2
+        usage >&2
+        exit 1
+      fi
+      OUTPUT="${2:-}"
+      shift 2
+      ;;
+    --drop-file-ownership)
+      DROP_FILE_OWNERSHIP="true"
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 1
+      ;;
+  esac
+done
+
+if [[ -z "${INPUT}" || -z "${OUTPUT}" ]]; then
+  usage >&2
+  exit 1
+fi
+
+if [[ ! -f "${INPUT}" ]]; then
+  echo "Input file not found: ${INPUT}" >&2
+  exit 1
+fi
+
+# Write to a temp file first so OUTPUT is only replaced by a complete result.
+TMP_OUT="$(mktemp)"
+# With set -e a jq failure exits before the mv below; remove the leftover temp
+# file on every exit path. After a successful mv the rm -f is a no-op.
+trap 'rm -f -- "${TMP_OUT}"' EXIT
+
+jq \
+  --argjson drop_file_ownership "${DROP_FILE_OWNERSHIP}" '
+  # First purl referenceLocator among the externalRefs of a package, or null.
+  def purl:
+    ((.externalRefs // [])
+      | map(select(.referenceType == "purl") | .referenceLocator)
+      | first);
+  # Grouping key for a package: the purl when present, otherwise a key that
+  # embeds the SPDXID so packages without a purl are never merged.
+  def package_key:
+    if (purl // "") != "" then
+      "purl|" + purl
+    else
+      # No purl means identity is uncertain across ecosystems/catalogers.
+      # Keep the key non-destructive by including provenance fields and SPDXID.
+      "nopurl|spdxid=" + (.SPDXID // "") +
+      "|name=" + (.name // "") +
+      "|version=" + (.versionInfo // "") +
+      "|supplier=" + (.supplier // "") +
+      "|sourceInfo=" + (.sourceInfo // "")
+    end;
+
+  . as $doc
+  | ($doc.packages // [] | map(. + {__dedupe_key: package_key})) as $pkgs
+  # One group per key; the first package of each group becomes canonical.
+  | ($pkgs
+      | sort_by(.__dedupe_key)
+      | group_by(.__dedupe_key)
+      | map({
+          canonical_spdxid: .[0].SPDXID,
+          all_spdxids: map(.SPDXID),
+          canonical_pkg: (.[0] | del(.__dedupe_key))
+        })) as $groups
+  | ($groups | map(.canonical_pkg)) as $new_packages
+  # Maps every original package SPDXID to the canonical SPDXID of its group.
+  | ($groups | map(.all_spdxids[] as $old | {($old): .canonical_spdxid}) | add // {}) as $id_map
+  # SPDXIDs of the file entries that get removed together with files[].
+  # Empty unless file ownership data is being dropped.
+  | (if $drop_file_ownership then
+       reduce ($doc.files // [] | .[] | objects | .SPDXID | strings) as $id
+         ({}; .[$id] = true)
+     else
+       {}
+     end) as $dropped_file_ids
+  | ($doc.relationships // []
+      | map(
+          .spdxElementId = ($id_map[.spdxElementId] // .spdxElementId)
+          | .relatedSpdxElement = ($id_map[.relatedSpdxElement] // .relatedSpdxElement)
+        )
+      | if $drop_file_ownership then
+          # Drop OTHER relationships and any relationship that would be left
+          # pointing at a removed file entry (dangling reference).
+          map(select(
+            .relationshipType != "OTHER"
+            and ($dropped_file_ids[.spdxElementId // ""] | not)
+            and ($dropped_file_ids[.relatedSpdxElement // ""] | not)
+          ))
+        else
+          .
+        end
+      | unique_by(.spdxElementId + "|" + .relationshipType + "|" + .relatedSpdxElement)
+    ) as $new_relationships
+  | $doc
+  | .packages = $new_packages
+  | .relationships = $new_relationships
+  | .documentDescribes = (($doc.documentDescribes // []) | map($id_map[.] // .) | unique)
+  | if $drop_file_ownership then del(.files) else . end
+' "${INPUT}" > "${TMP_OUT}"
+
+mv "${TMP_OUT}" "${OUTPUT}"