dedupe-spdx-gomod.sh 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. #!/usr/bin/env bash
  2. set -euo pipefail
  # Print the command-line help text to stdout. Takes no arguments and does
  # not exit; error paths in this script call it as `usage >&2`.
  usage() {
    # Quoted delimiter: the text is emitted literally, with no expansion.
    cat <<'EOF'
Usage:
dedupe-spdx-gomod.sh --input INPUT_SPDX_JSON --output OUTPUT_SPDX_JSON [--drop-file-ownership]
Description:
Deduplicates SPDX package nodes by purl, rewrites relationships to canonical
SPDX IDs, and deduplicates relationships. Packages without a purl are merged
only when SPDXID, name, version, supplier and sourceInfo all match.
Optional flag --drop-file-ownership removes file ownership-heavy data:
- drops relationshipType OTHER
- removes files[] entries
EOF
  }
  # Option state filled in by the loop below. DROP_FILE_OWNERSHIP holds the
  # literal text "true"/"false" because it is later passed to jq via --argjson.
  INPUT=""
  OUTPUT=""
  DROP_FILE_OWNERSHIP="false"

  # Consume the positional parameters left to right until none remain.
  until [[ $# -eq 0 ]]; do
    case "$1" in
      --input)
        # The value must exist, be non-empty and must not start with "-"
        # (that would mean the next option was taken as the value).
        [[ -n "${2:-}" && "${2:-}" != -* ]] || {
          echo "Error: --input requires a value" >&2
          usage >&2
          exit 1
        }
        INPUT="$2"
        shift 2
        ;;
      --output)
        # Same value rules as --input.
        [[ -n "${2:-}" && "${2:-}" != -* ]] || {
          echo "Error: --output requires a value" >&2
          usage >&2
          exit 1
        }
        OUTPUT="$2"
        shift 2
        ;;
      --drop-file-ownership)
        DROP_FILE_OWNERSHIP="true"
        shift
        ;;
      -h | --help)
        # Help goes to stdout and is a successful exit.
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 1
        ;;
    esac
  done
  # Both paths are mandatory. Say so explicitly before printing the usage
  # text, matching the "Error: ..." style of the option-parsing failures.
  if [[ -z "${INPUT}" || -z "${OUTPUT}" ]]; then
    echo "Error: --input and --output are required" >&2
    usage >&2
    exit 1
  fi

  # The input must be an existing regular file before jq is invoked on it.
  if [[ ! -f "${INPUT}" ]]; then
    echo "Input file not found: ${INPUT}" >&2
    exit 1
  fi
  # Run the transformation into a temporary file and move it over OUTPUT only
  # after jq has succeeded, so a failed run never leaves a partial OUTPUT.
  TMP_OUT="$(mktemp)"
  # Remove the temp file on every exit path, including a jq failure that
  # aborts the script under set -e. After a successful mv the path no longer
  # exists and rm -f is a no-op.
  trap 'rm -f -- "${TMP_OUT}"' EXIT

  jq \
    --argjson drop_file_ownership "${DROP_FILE_OWNERSHIP}" '
    # First purl referenceLocator among the externalRefs of a package,
    # or null when the package has none.
    def purl:
      ((.externalRefs // [])
       | map(select(.referenceType == "purl") | .referenceLocator)
       | first);

    # Grouping key for a package: the purl when present, otherwise a
    # composite of SPDXID and provenance fields.
    def package_key:
      if (purl // "") != "" then
        "purl|" + purl
      else
        # No purl means identity is uncertain across ecosystems/catalogers.
        # Keep the key non-destructive by including provenance fields and SPDXID.
        "nopurl|spdxid=" + (.SPDXID // "") +
        "|name=" + (.name // "") +
        "|version=" + (.versionInfo // "") +
        "|supplier=" + (.supplier // "") +
        "|sourceInfo=" + (.sourceInfo // "")
      end;

    . as $doc
    # Tag every package with its grouping key.
    | ($doc.packages // [] | map(. + {__dedupe_key: package_key})) as $pkgs
    # One group per key; the first package of each group is the canonical one
    # and the temporary key is stripped from it again.
    | ($pkgs
       | sort_by(.__dedupe_key)
       | group_by(.__dedupe_key)
       | map({
           canonical_spdxid: .[0].SPDXID,
           all_spdxids: map(.SPDXID),
           canonical_pkg: (.[0] | del(.__dedupe_key))
         })) as $groups
    | ($groups | map(.canonical_pkg)) as $new_packages
    # Lookup table: every SPDXID seen in a group -> canonical SPDXID of that group.
    | ($groups | map(.all_spdxids[] as $old | {($old): .canonical_spdxid}) | add // {}) as $id_map
    | ($doc.relationships // []
       # Point both ends of each relationship at the canonical IDs; IDs that
       # are not in the table are left unchanged.
       | map(
           .spdxElementId = ($id_map[.spdxElementId] // .spdxElementId)
           | .relatedSpdxElement = ($id_map[.relatedSpdxElement] // .relatedSpdxElement)
         )
       | if $drop_file_ownership then
           map(select(.relationshipType != "OTHER"))
         else
           .
         end
       # Keep one relationship per (source, type, target) triple.
       | unique_by(.spdxElementId + "|" + .relationshipType + "|" + .relatedSpdxElement)
      ) as $new_relationships
    | $doc
    | .packages = $new_packages
    | .relationships = $new_relationships
    # Remap documentDescribes through the same table and drop duplicates.
    | .documentDescribes = (($doc.documentDescribes // []) | map($id_map[.] // .) | unique)
    | if $drop_file_ownership then del(.files) else . end
    ' "${INPUT}" > "${TMP_OUT}"

  mv -- "${TMP_OUT}" "${OUTPUT}"