| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121 |
- #!/usr/bin/env bash
- set -euo pipefail
- usage() {
- cat <<'EOF'
- Usage:
- dedupe-spdx-gomod.sh --input INPUT_SPDX_JSON --output OUTPUT_SPDX_JSON [--drop-file-ownership]
- Description:
- Deduplicates SPDX package nodes by purl (fallback: name@version), rewrites
- relationships to canonical SPDX IDs, and deduplicates relationships.
- Optional flag --drop-file-ownership removes file ownership-heavy data:
- - drops relationshipType OTHER
- - removes files[] entries
- EOF
- }
- INPUT=""
- OUTPUT=""
- DROP_FILE_OWNERSHIP="false"
- while [[ $# -gt 0 ]]; do
- case "$1" in
- --input)
- if [[ -z "${2:-}" || "${2:-}" == --* || "${2:-}" == -* ]]; then
- echo "Error: --input requires a value" >&2
- usage >&2
- exit 1
- fi
- INPUT="${2:-}"
- shift 2
- ;;
- --output)
- if [[ -z "${2:-}" || "${2:-}" == --* || "${2:-}" == -* ]]; then
- echo "Error: --output requires a value" >&2
- usage >&2
- exit 1
- fi
- OUTPUT="${2:-}"
- shift 2
- ;;
- --drop-file-ownership)
- DROP_FILE_OWNERSHIP="true"
- shift
- ;;
- -h|--help)
- usage
- exit 0
- ;;
- *)
- echo "Unknown argument: $1" >&2
- usage >&2
- exit 1
- ;;
- esac
- done
- if [[ -z "${INPUT}" || -z "${OUTPUT}" ]]; then
- usage >&2
- exit 1
- fi
- if [[ ! -f "${INPUT}" ]]; then
- echo "Input file not found: ${INPUT}" >&2
- exit 1
- fi
- TMP_OUT="$(mktemp)"
- jq \
- --argjson drop_file_ownership "${DROP_FILE_OWNERSHIP}" '
- def purl:
- ((.externalRefs // [])
- | map(select(.referenceType == "purl") | .referenceLocator)
- | first);
- def package_key:
- if (purl // "") != "" then
- "purl|" + purl
- else
- # No purl means identity is uncertain across ecosystems/catalogers.
- # Keep the key non-destructive by including provenance fields and SPDXID.
- "nopurl|spdxid=" + (.SPDXID // "") +
- "|name=" + (.name // "") +
- "|version=" + (.versionInfo // "") +
- "|supplier=" + (.supplier // "") +
- "|sourceInfo=" + (.sourceInfo // "")
- end;
- . as $doc
- | ($doc.packages // [] | map(. + {__dedupe_key: package_key})) as $pkgs
- | ($pkgs
- | sort_by(.__dedupe_key)
- | group_by(.__dedupe_key)
- | map({
- canonical_spdxid: .[0].SPDXID,
- all_spdxids: map(.SPDXID),
- canonical_pkg: (.[0] | del(.__dedupe_key))
- })) as $groups
- | ($groups | map(.canonical_pkg)) as $new_packages
- | ($groups | map(.all_spdxids[] as $old | {($old): .canonical_spdxid}) | add // {}) as $id_map
- | ($doc.relationships // []
- | map(
- .spdxElementId = ($id_map[.spdxElementId] // .spdxElementId)
- | .relatedSpdxElement = ($id_map[.relatedSpdxElement] // .relatedSpdxElement)
- )
- | if $drop_file_ownership then
- map(select(.relationshipType != "OTHER"))
- else
- .
- end
- | unique_by(.spdxElementId + "|" + .relationshipType + "|" + .relatedSpdxElement)
- ) as $new_relationships
- | $doc
- | .packages = $new_packages
- | .relationships = $new_relationships
- | .documentDescribes = (($doc.documentDescribes // []) | map($id_map[.] // .) | unique)
- | if $drop_file_ownership then del(.files) else . end
- ' "${INPUT}" > "${TMP_OUT}"
- mv "${TMP_OUT}" "${OUTPUT}"
|