From ad6215044cc6aec0809ad0639498882304aa9fa0 Mon Sep 17 00:00:00 2001 From: michaelkedar Date: Thu, 9 Apr 2026 03:29:40 +0000 Subject: [PATCH 1/2] feat: implement coarse versioning in go --- go/cmd/extract_versions/main.go | 126 +++++++++++++++ go/osv/ecosystem/.gitignore | 1 + go/osv/ecosystem/apk.go | 17 +- go/osv/ecosystem/coarse_large_test.go | 104 ++++++++++++ go/osv/ecosystem/cran.go | 27 +++- go/osv/ecosystem/dpkg.go | 71 ++++++++- go/osv/ecosystem/hackage.go | 14 +- go/osv/ecosystem/maven.go | 17 +- go/osv/ecosystem/nuget.go | 4 +- go/osv/ecosystem/packagist.go | 46 +++++- go/osv/ecosystem/pub.go | 5 +- go/osv/ecosystem/pypi.go | 41 ++++- go/osv/ecosystem/rpm.go | 43 ++++- go/osv/ecosystem/rubygems.go | 12 +- go/osv/ecosystem/semver.go | 18 ++- .../testdata/regen_coarse_test_data.sh | 21 +++ go/osv/ecosystem/util.go | 149 ++++++++++++++++++ 17 files changed, 687 insertions(+), 29 deletions(-) create mode 100644 go/cmd/extract_versions/main.go create mode 100644 go/osv/ecosystem/.gitignore create mode 100644 go/osv/ecosystem/coarse_large_test.go create mode 100755 go/osv/ecosystem/testdata/regen_coarse_test_data.sh diff --git a/go/cmd/extract_versions/main.go b/go/cmd/extract_versions/main.go new file mode 100644 index 00000000000..0cb73925d39 --- /dev/null +++ b/go/cmd/extract_versions/main.go @@ -0,0 +1,126 @@ +// Package main implements a tool to extract unique versions from an OSV vulnerabilities zip file. 
+package main + +import ( + "archive/zip" + "fmt" + "io" + "maps" + "os" + "runtime" + "slices" + "sync" + + "github.com/ossf/osv-schema/bindings/go/osvschema" + "google.golang.org/protobuf/encoding/protojson" +) + +func main() { + if len(os.Args) != 3 { + fmt.Println("Usage: extract_versions ") + os.Exit(1) + } + zipFile := os.Args[1] + outputFile := os.Args[2] + + zipReader, err := zip.OpenReader(zipFile) + if err != nil { + panic(err) + } + defer zipReader.Close() + + versionChan := make(chan string, 1000) + var wg sync.WaitGroup + + // Worker pool + numWorkers := runtime.NumCPU() + fileChan := make(chan *zip.File) + + // Start workers + for range numWorkers { + wg.Add(1) + go func() { + defer wg.Done() + for file := range fileChan { + r, err := file.Open() + if err != nil { + fmt.Fprintln(os.Stderr, err) + continue + } + bytes, err := io.ReadAll(r) + r.Close() + if err != nil { + fmt.Fprintln(os.Stderr, err) + continue + } + var vuln osvschema.Vulnerability + if err := protojson.Unmarshal(bytes, &vuln); err != nil { + fmt.Fprintln(os.Stderr, err) + continue + } + for _, affected := range vuln.GetAffected() { + for _, version := range affected.GetVersions() { + versionChan <- version + } + for _, ranges := range affected.GetRanges() { + if ranges.GetType() == osvschema.Range_GIT { + continue + } + for _, event := range ranges.GetEvents() { + if event.GetIntroduced() != "" { + versionChan <- event.GetIntroduced() + } + if event.GetFixed() != "" { + versionChan <- event.GetFixed() + } + if event.GetLastAffected() != "" { + versionChan <- event.GetLastAffected() + } + } + } + } + } + }() + } + + // Collector + allVersions := make(map[string]struct{}) + doneChan := make(chan struct{}) + go func() { + for v := range versionChan { + allVersions[v] = struct{}{} + } + close(doneChan) + }() + + // Feed files to workers + for _, file := range zipReader.File { + fileChan <- file + } + close(fileChan) + + // Wait for workers to finish + wg.Wait() + close(versionChan) + + // 
Wait for collector to finish + <-doneChan + + // Sort versions + vers := slices.Sorted(maps.Keys(allVersions)) + + // Write to output file + f, err := os.Create(outputFile) + if err != nil { + panic(err) + } + defer f.Close() + + for _, v := range vers { + if _, err := fmt.Fprintln(f, v); err != nil { + panic(err) + } + } + + fmt.Printf("Successfully extracted %d unique versions to %s\n", len(vers), outputFile) +} diff --git a/go/osv/ecosystem/.gitignore b/go/osv/ecosystem/.gitignore new file mode 100644 index 00000000000..4292fe34ac2 --- /dev/null +++ b/go/osv/ecosystem/.gitignore @@ -0,0 +1 @@ +testdata/all_versions.txt \ No newline at end of file diff --git a/go/osv/ecosystem/apk.go b/go/osv/ecosystem/apk.go index 3c6290ed451..d27cfc15744 100644 --- a/go/osv/ecosystem/apk.go +++ b/go/osv/ecosystem/apk.go @@ -14,7 +14,11 @@ package ecosystem -import "github.com/google/osv-scalibr/semantic" +import ( + "regexp" + + "github.com/google/osv-scalibr/semantic" +) // apkEcosystem is an ecosystem for ecosystems using Alpine Package Keeper versioning. type apkEcosystem struct{} @@ -30,7 +34,18 @@ func (e apkEcosystem) Parse(version string) (Version, error) { return SemanticVersionWrapper[semantic.AlpineVersion]{ver}, nil } +//nolint:unused +var apkCoarseVersioner = CoarseVersioner{ + Separators: regexp.MustCompile(`[.]`), + Truncate: regexp.MustCompile(`(?:\.0|[_-])`), + ImplicitSplit: true, + EmptyAs: &[]string{""}[0], +} + func (e apkEcosystem) Coarse(_ string) (string, error) { + // TODO(michaelkedar): semantic.AlpineVersion currently breaks transitivity rules + // (a < b, b < c, c < a) in some cases with invalid versions. + // That makes coarse versions unreliable, so they are not supported yet. 
return "", ErrCoarseNotSupported } diff --git a/go/osv/ecosystem/coarse_large_test.go b/go/osv/ecosystem/coarse_large_test.go new file mode 100644 index 00000000000..de74645103f --- /dev/null +++ b/go/osv/ecosystem/coarse_large_test.go @@ -0,0 +1,104 @@ +package ecosystem + +import ( + "bufio" + "os" + "slices" + "testing" +) + +func TestCoarseMonotonicityLarge(t *testing.T) { + if os.Getenv("RUN_COARSE_LARGE_TEST") != "1" { + t.Skip("Skipping large test: RUN_COARSE_LARGE_TEST=1 not set") + } + + filePath := "testdata/all_versions.txt" + f, err := os.Open(filePath) + if os.IsNotExist(err) { + t.Skipf("Skipping large test: %s not found. Run tools to generate it from all.zip.", filePath) + } + if err != nil { + t.Fatal(err) + } + defer f.Close() + + var allVers []string + scanner := bufio.NewScanner(f) + for scanner.Scan() { + allVers = append(allVers, scanner.Text()) + } + if err := scanner.Err(); err != nil { + t.Fatal(err) + } + + p := DefaultProvider + + // List of unique ecosystems to test + ecosystemsToTest := []string{ + "Packagist", + "PyPI", + "Maven", + "RubyGems", + "CRAN", + "NuGet", + "Pub", + "Hackage", + "Debian", // Represents dpkgEcosystem + "Go", // Represents semverEcosystem + "Red Hat", // Represents rpmEcosystem + // "Alpine", // Represents apkEcosystem + } + + for _, ecoName := range ecosystemsToTest { + t.Run(ecoName, func(t *testing.T) { + t.Parallel() + ecoName := ecoName + e, ok := p.Get(ecoName) + if !ok { + t.Fatalf("failed to get ecosystem %s", ecoName) + } + + type vers struct { + Raw string + Parsed Version + Coarse string + } + + var goodVers []vers + for _, ver := range allVers { + v, errP := e.Parse(ver) + c, errC := e.Coarse(ver) + if (errP == nil) != (errC == nil) { + // Inconsistent failure between Parse and Coarse is usually a bug + // but for large tests we might just skip them or log them to avoid spam. 
+ continue + } + if errP == nil { + goodVers = append(goodVers, vers{ver, v, c}) + } + } + + if len(goodVers) == 0 { + t.Logf("No valid versions found for %s in the dataset", ecoName) + return + } + + slices.SortFunc(goodVers, func(a, b vers) int { + c, err := a.Parsed.Compare(b.Parsed) + if err != nil { + t.Fatalf("error comparing! %v", err) + } + + return c + }) + + prev := goodVers[0] + for _, next := range goodVers[1:] { + if prev.Coarse > next.Coarse { + t.Errorf("Monotonicity violation: %q <= %q but coarse %s > %s", prev.Raw, next.Raw, prev.Coarse, next.Coarse) + } + prev = next + } + }) + } +} diff --git a/go/osv/ecosystem/cran.go b/go/osv/ecosystem/cran.go index 558300a904d..f916f3bf599 100644 --- a/go/osv/ecosystem/cran.go +++ b/go/osv/ecosystem/cran.go @@ -16,7 +16,9 @@ package ecosystem import ( "fmt" + "math/big" "net/url" + "strings" "github.com/google/osv-scalibr/semantic" ) @@ -44,8 +46,29 @@ func (e cranEcosystem) Parse(version string) (Version, error) { return SemanticVersionWrapper[semantic.CRANVersion]{ver}, nil } -func (e cranEcosystem) Coarse(_ string) (string, error) { - return "", ErrCoarseNotSupported +func (e cranEcosystem) Coarse(version string) (string, error) { + // this logic is lifted directly from semantic.ParseCRANVersion + // for now, treat an empty version string as valid + if version == "" { + version = "0" + } + + // dashes and periods have the same weight, so we can just normalize to periods + parts := strings.Split(strings.ReplaceAll(version, "-", "."), ".") + + comps := make([]*big.Int, 0, len(parts)) + + for _, s := range parts { + v, ok := new(big.Int).SetString(s, 10) + + if !ok { + return "", fmt.Errorf("invalid component in version: %s", s) + } + + comps = append(comps, v) + } + + return coarseFromInts(bigZero, comps...), nil } func (e cranEcosystem) IsSemver() bool { diff --git a/go/osv/ecosystem/dpkg.go b/go/osv/ecosystem/dpkg.go index c2e0fd4fa96..a42ff3fe0f4 100644 --- a/go/osv/ecosystem/dpkg.go +++ 
b/go/osv/ecosystem/dpkg.go @@ -14,7 +14,13 @@ package ecosystem -import "github.com/google/osv-scalibr/semantic" +import ( + "fmt" + "math/big" + "strings" + + "github.com/google/osv-scalibr/semantic" +) // dpkgEcosystem is an ecosystem for ecosystems using Debian Package versioning. type dpkgEcosystem struct{} @@ -30,8 +36,67 @@ func (e dpkgEcosystem) Parse(version string) (Version, error) { return SemanticVersionWrapper[semantic.DebianVersion]{ver}, nil } -func (e dpkgEcosystem) Coarse(_ string) (string, error) { - return "", ErrCoarseNotSupported +func (e dpkgEcosystem) Coarse(version string) (string, error) { + epochStr, rest, hasColon := strings.Cut(version, ":") + epoch := big.NewInt(0) + if hasColon { + if _, ok := epoch.SetString(epochStr, 10); !ok { + return "", fmt.Errorf("invalid epoch: %s", epochStr) + } + version = rest + } + // strip out the revision suffix to avoid potential confusion in computation + if i := strings.LastIndex(version, "-"); i >= 0 { + version = version[:i] + } + + // Versions are treated as alternating digit/non-digit strings. + parts := implicitRegex.FindAllString(version, -1) + var comps []*big.Int + + if len(parts) > 0 && !isDecimal(parts[0]) { + // dpkg versions are actually required to start with numbers. + // For some reason, semantic just treats these invalid versions as greater. + comps = append(comps, big.NewInt(100000000)) + } else { + for i := 0; i < len(parts); i += 2 { + p := parts[i] + if !isDecimal(p) { + break + } + bi := new(big.Int) + bi.SetString(p, 10) + comps = append(comps, bi) + + if i+1 >= len(parts) { + break + } + sep := parts[i+1] + // We treat the exact string '.' as a digit separator. + if sep == "." { + continue + } + + // semantic treats all letters less than all non-letters, + // but allows all non-letters. 
+ firstChar := sep[0] + switch { + case firstChar >= 'A' && firstChar <= 'Z', + firstChar >= 'a' && firstChar <= 'z', + firstChar < '.', + firstChar == '~': + // These are allowed characters that do not trigger overflow. + default: + // Trigger an overflow because these characters are considered + // greater than a single dot separator + comps = append(comps, big.NewInt(100000000)) + } + + break + } + } + + return coarseFromInts(epoch, comps...), nil } func (e dpkgEcosystem) IsSemver() bool { diff --git a/go/osv/ecosystem/hackage.go b/go/osv/ecosystem/hackage.go index 2d3d49ee96e..b9b1a637f2e 100644 --- a/go/osv/ecosystem/hackage.go +++ b/go/osv/ecosystem/hackage.go @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -//nolint:dupl package ecosystem import ( + "errors" "fmt" "net/url" + "strings" "github.com/google/osv-scalibr/semantic" ) @@ -37,8 +38,15 @@ func (e hackageEcosystem) Parse(version string) (Version, error) { return SemanticVersionWrapper[semantic.HackageVersion]{ver}, nil } -func (e hackageEcosystem) Coarse(_ string) (string, error) { - return "", ErrCoarseNotSupported +func (e hackageEcosystem) Coarse(version string) (string, error) { + version = strings.TrimPrefix(version, "v") + // hackage versions are only allowed to be digits and dots + idx := strings.IndexFunc(version, func(r rune) bool { return (r < '0' || r > '9') && r != '.' 
}) + if idx >= 0 { + return "", errors.New("version contains invalid characters") + } + + return semverCoarseVersioner.Format(0, version), nil } func (e hackageEcosystem) IsSemver() bool { diff --git a/go/osv/ecosystem/maven.go b/go/osv/ecosystem/maven.go index 411b6af21d2..66825371a8d 100644 --- a/go/osv/ecosystem/maven.go +++ b/go/osv/ecosystem/maven.go @@ -14,7 +14,11 @@ package ecosystem -import "github.com/google/osv-scalibr/semantic" +import ( + "regexp" + + "github.com/google/osv-scalibr/semantic" +) type mavenEcosystem struct { p *Provider @@ -26,8 +30,15 @@ func (e mavenEcosystem) Parse(version string) (Version, error) { return SemanticVersionWrapper[semantic.MavenVersion]{semantic.ParseMavenVersion(version)}, nil } -func (e mavenEcosystem) Coarse(_ string) (string, error) { - return "", ErrCoarseNotSupported +var mavenCoarseVersioner = CoarseVersioner{ + Separators: regexp.MustCompile(`[.]`), + Truncate: regexp.MustCompile(`-`), + ImplicitSplit: true, + EmptyAs: &[]string{"0"}[0], +} + +func (e mavenEcosystem) Coarse(version string) (string, error) { + return mavenCoarseVersioner.Format(0, version), nil } func (e mavenEcosystem) IsSemver() bool { diff --git a/go/osv/ecosystem/nuget.go b/go/osv/ecosystem/nuget.go index f2844075f06..90f5f902da8 100644 --- a/go/osv/ecosystem/nuget.go +++ b/go/osv/ecosystem/nuget.go @@ -37,8 +37,8 @@ func (e nugetEcosystem) Parse(version string) (Version, error) { return SemanticVersionWrapper[semantic.NuGetVersion]{semantic.ParseNuGetVersion(version)}, nil } -func (e nugetEcosystem) Coarse(_ string) (string, error) { - return "", ErrCoarseNotSupported +func (e nugetEcosystem) Coarse(version string) (string, error) { + return semverCoarseVersioner.Format(0, strings.TrimPrefix(version, "v")), nil } func (e nugetEcosystem) IsSemver() bool { diff --git a/go/osv/ecosystem/packagist.go b/go/osv/ecosystem/packagist.go index dc16158f76c..b8737a6ab86 100644 --- a/go/osv/ecosystem/packagist.go +++ b/go/osv/ecosystem/packagist.go @@ 
-16,6 +16,9 @@ package ecosystem import ( "fmt" + "math/big" + "regexp" + "strings" "github.com/google/osv-scalibr/semantic" ) @@ -30,8 +33,47 @@ func (e packagistEcosystem) Parse(version string) (Version, error) { return SemanticVersionWrapper[semantic.PackagistVersion]{semantic.ParsePackagistVersion(version)}, nil } -func (e packagistEcosystem) Coarse(_ string) (string, error) { - return "", ErrCoarseNotSupported +var packagistSepRegex = regexp.MustCompile(`[-_+.]`) + +// Treats version as integers separated by ., -, _, or +. +// Treats 'p'/'pl' prefixes as maximal ints to ensure they sort after base versions +// (e.g. 1.0 < 1.0-p1). +func (e packagistEcosystem) Coarse(version string) (string, error) { + version = strings.TrimPrefix(version, "v") + version = strings.TrimPrefix(version, "V") + + sepParts := packagistSepRegex.Split(version, -1) + + var parts []string + for _, sp := range sepParts { + if sp == "" { + parts = append(parts, "") + continue + } + subParts := implicitRegex.FindAllString(sp, -1) + parts = append(parts, subParts...) 
+ } + + var comps []*big.Int + count := 0 + for _, p := range parts { + if count >= 3 { + break + } + // 'p' and 'pl' (and similar) are considered greater than numbers + if strings.HasPrefix(p, "p") { + comps = append(comps, big.NewInt(100000000)) + } else if !isDecimal(p) || p == "" { + break + } else { + bi := new(big.Int) + bi.SetString(p, 10) + comps = append(comps, bi) + } + count++ + } + + return coarseFromInts(bigZero, comps...), nil } func (e packagistEcosystem) IsSemver() bool { diff --git a/go/osv/ecosystem/pub.go b/go/osv/ecosystem/pub.go index c71c362360a..4fc46991db9 100644 --- a/go/osv/ecosystem/pub.go +++ b/go/osv/ecosystem/pub.go @@ -17,6 +17,7 @@ package ecosystem import ( "fmt" "net/url" + "strings" "github.com/google/osv-scalibr/semantic" ) @@ -31,8 +32,8 @@ func (e pubEcosystem) Parse(version string) (Version, error) { return SemanticVersionWrapper[semantic.PubVersion]{semantic.ParsePubVersion(version)}, nil } -func (e pubEcosystem) Coarse(_ string) (string, error) { - return "", ErrCoarseNotSupported +func (e pubEcosystem) Coarse(version string) (string, error) { + return semverCoarseVersioner.Format(0, strings.TrimPrefix(version, "v")), nil } func (e pubEcosystem) IsSemver() bool { diff --git a/go/osv/ecosystem/pypi.go b/go/osv/ecosystem/pypi.go index 2f8ee65b6cb..54014cea3cb 100644 --- a/go/osv/ecosystem/pypi.go +++ b/go/osv/ecosystem/pypi.go @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-//nolint:dupl package ecosystem import ( "fmt" "net/url" + "regexp" + "strconv" + "strings" "github.com/google/osv-scalibr/semantic" ) @@ -37,8 +39,41 @@ func (e pypiEcosystem) Parse(version string) (Version, error) { return SemanticVersionWrapper[semantic.PyPIVersion]{ver}, nil } -func (e pypiEcosystem) Coarse(_ string) (string, error) { - return "", ErrCoarseNotSupported +// https://peps.python.org/pep-0440/#appendix-b-parsing-version-strings-with-regular-expressions +// Capture epoch, and remainder, since that's all we need to actually parse +var pypiCanonicalRegex = regexp.MustCompile(`^\s*v?(?:(?:([0-9]+)!)?((?:[0-9]+(?:\.[0-9]+)*)(?:[-_\.]?(?:(?:a|b|c|rc|alpha|beta|pre|preview))[-_\.]?(?:[0-9]+)?)?(?:(?:-(?:[0-9]+))|(?:[-_\.]?(?:post|rev|r)[-_\.]?(?:[0-9]+)?))?(?:[-_\.]?(?:dev)[-_\.]?(?:[0-9]+)?)?)(?:\+(?:[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?)\s*$`) + +var pypiCoarseVersioner = CoarseVersioner{ + Separators: regexp.MustCompile(`[.]`), + Truncate: regexp.MustCompile(`[+_-]`), + ImplicitSplit: true, + EmptyAs: nil, +} + +func (e pypiEcosystem) Coarse(version string) (string, error) { + version = strings.ToLower(version) + match := pypiCanonicalRegex.FindStringSubmatch(version) + if match == nil { + // no match, this is a legacy version which sorts before non-legacy + return "00:00000000.00000000.00000000", nil + } + epochStr := match[1] + epochlessVer := match[2] + epochStr = strings.TrimLeft(epochStr, "0") + if epochStr == "" { + epochStr = "0" + } + if len(epochStr) > 2 { + // epoch is > 99, return maximum coarse version + return "99:99999999.99999999.99999999", nil + } + epoch, err := strconv.Atoi(epochStr) + if err != nil { + // we've validated the string, so this should be unreachable + return "", err + } + + return pypiCoarseVersioner.Format(epoch, epochlessVer), nil } func (e pypiEcosystem) IsSemver() bool { diff --git a/go/osv/ecosystem/rpm.go b/go/osv/ecosystem/rpm.go index d28efec7ebb..7736ead9473 100644 --- a/go/osv/ecosystem/rpm.go +++ 
b/go/osv/ecosystem/rpm.go @@ -14,7 +14,13 @@ package ecosystem -import "github.com/google/osv-scalibr/semantic" +import ( + "regexp" + "strconv" + "strings" + + "github.com/google/osv-scalibr/semantic" +) // rpmEcosystem is an ecosystem for ecosystems using Red Hat Package Manager versioning. type rpmEcosystem struct{} @@ -25,8 +31,39 @@ func (e rpmEcosystem) Parse(version string) (Version, error) { return SemanticVersionWrapper[semantic.RedHatVersion]{semantic.ParseRedHatVersion(version)}, nil } -func (e rpmEcosystem) Coarse(_ string) (string, error) { - return "", ErrCoarseNotSupported +var rpmCoarseVersioner = CoarseVersioner{ + Separators: regexp.MustCompile(`[^0-9A-Za-z~^-]`), + Truncate: regexp.MustCompile(`[~^-]`), + ImplicitSplit: true, + EmptyAs: nil, +} + +func (e rpmEcosystem) Coarse(version string) (string, error) { + epochStr, rest, hasColon := strings.Cut(version, ":") + epoch := 0 + if hasColon { + if !isDecimal(epochStr) { + // epoch is not a number, treat it as 0 + return rpmCoarseVersioner.Format(0, version), nil + } + epochStr = strings.TrimLeft(epochStr, "0") + if epochStr == "" { + epochStr = "0" + } + if len(epochStr) > 2 { + // epoch is > 99, return maximum coarse version + return "99:99999999.99999999.99999999", nil + } + var err error + epoch, err = strconv.Atoi(epochStr) + if err != nil { + // we've validated the string, so this should be unreachable + return "", err + } + version = rest + } + + return rpmCoarseVersioner.Format(epoch, version), nil } func (e rpmEcosystem) IsSemver() bool { diff --git a/go/osv/ecosystem/rubygems.go b/go/osv/ecosystem/rubygems.go index 513a1c26ce4..387f9eca525 100644 --- a/go/osv/ecosystem/rubygems.go +++ b/go/osv/ecosystem/rubygems.go @@ -17,6 +17,7 @@ package ecosystem import ( "fmt" "net/url" + "regexp" "github.com/google/osv-scalibr/semantic" ) @@ -35,8 +36,15 @@ func (e rubyGemsEcosystem) Parse(version string) (Version, error) { return 
SemanticVersionWrapper[semantic.RubyGemsVersion]{semantic.ParseRubyGemsVersion(version)}, nil } -func (e rubyGemsEcosystem) Coarse(_ string) (string, error) { - return "", ErrCoarseNotSupported +var rubyGemsCoarseVersioner = CoarseVersioner{ + Separators: regexp.MustCompile(`[.]`), + Truncate: regexp.MustCompile(`-`), + ImplicitSplit: true, + EmptyAs: &[]string{""}[0], +} + +func (e rubyGemsEcosystem) Coarse(version string) (string, error) { + return rubyGemsCoarseVersioner.Format(0, version), nil } func (e rubyGemsEcosystem) IsSemver() bool { diff --git a/go/osv/ecosystem/semver.go b/go/osv/ecosystem/semver.go index 5c85db5905c..bb6d4625c76 100644 --- a/go/osv/ecosystem/semver.go +++ b/go/osv/ecosystem/semver.go @@ -14,7 +14,12 @@ package ecosystem -import "github.com/google/osv-scalibr/semantic" +import ( + "regexp" + "strings" + + "github.com/google/osv-scalibr/semantic" +) // semverLikeEcosystem is an ecosystem that uses a SemVer-like versions in the OSV schema. // but uses the ECOSYSTEM version type in the OSV schema. 
@@ -26,8 +31,15 @@ func (e semverLikeEcosystem) Parse(version string) (Version, error) { return SemanticVersionWrapper[semantic.SemverVersion]{semantic.ParseSemverVersion(version)}, nil } -func (e semverLikeEcosystem) Coarse(_ string) (string, error) { - return "", ErrCoarseNotSupported +var semverCoarseVersioner = CoarseVersioner{ + Separators: regexp.MustCompile(`[.]`), + Truncate: regexp.MustCompile(`[-+]`), + ImplicitSplit: true, + EmptyAs: nil, +} + +func (e semverLikeEcosystem) Coarse(version string) (string, error) { + return semverCoarseVersioner.Format(0, strings.TrimPrefix(version, "v")), nil } func (e semverLikeEcosystem) IsSemver() bool { diff --git a/go/osv/ecosystem/testdata/regen_coarse_test_data.sh b/go/osv/ecosystem/testdata/regen_coarse_test_data.sh new file mode 100755 index 00000000000..d2e341dbb70 --- /dev/null +++ b/go/osv/ecosystem/testdata/regen_coarse_test_data.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -e + +# Get the directory of this script +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +MODULE_ROOT="$( cd "$SCRIPT_DIR/../../.." &> /dev/null && pwd )" + +ZIP_URL="https://storage.googleapis.com/osv-vulnerabilities/all.zip" +TEMP_ZIP="$(mktemp)" + +echo "Downloading all.zip..." +curl -o "$TEMP_ZIP" "$ZIP_URL" + +echo "Extracting versions..." +# Run the Go command from the module root +(cd "$MODULE_ROOT" && go run ./cmd/extract_versions "$TEMP_ZIP" "$SCRIPT_DIR/all_versions.txt") + +echo "Cleaning up..." +rm "$TEMP_ZIP" + +echo "Done! 
Generated $SCRIPT_DIR/all_versions.txt" diff --git a/go/osv/ecosystem/util.go b/go/osv/ecosystem/util.go index 74984757612..1d815bc58c5 100644 --- a/go/osv/ecosystem/util.go +++ b/go/osv/ecosystem/util.go @@ -16,7 +16,10 @@ package ecosystem import ( "fmt" + "math/big" + "regexp" "slices" + "strings" ) type parsedVersion struct { @@ -58,3 +61,149 @@ func sortVersions(e Ecosystem, versions []string) ([]string, error) { return result, sortErr } + +var ( + maxCoarseEpoch = big.NewInt(99) + maxCoarsePart = big.NewInt(99999999) + bigZero = big.NewInt(0) +) + +// CoarseVersioner contains configuration for generating coarse versions. +type CoarseVersioner struct { + Separators *regexp.Regexp // Regex for separators (e.g. types using dot notation). + Truncate *regexp.Regexp // Regex marking where to truncate: everything from its first match onward is dropped (e.g. prerelease/build suffixes). If nil, no truncation. + ImplicitSplit bool // If true, splits on transitions between digits and non-digits. + EmptyAs *string // If not nil, treats empty parts as the given string instead of removing them. If nil, removes them. +} + +var implicitRegex = regexp.MustCompile(`\d+|\D+`) + +// Format converts a version string into a coarse, lexicographically comparable string. +func (v CoarseVersioner) Format(epoch int, version string) string { + if version == "0" { + return coarseFromInts(big.NewInt(int64(epoch)), bigZero, bigZero, bigZero) + } + + main := version + if v.Truncate != nil { + // Truncate off trailing components (e.g. prerelease/build) + main = v.Truncate.Split(version, 2)[0] + } + + parts := v.Separators.Split(main, -1) + if v.ImplicitSplit { + // Also split on transitions between digits and non-digits + var splitParts []string + for _, part := range parts { + if part == "" { + splitParts = append(splitParts, "") + continue + } + splitParts = append(splitParts, implicitRegex.FindAllString(part, -1)...) 
+ } + parts = splitParts + } + + filteredParts := make([]string, 0, len(parts)) + // Filter empty parts or treat as zero + if v.EmptyAs != nil { + for _, p := range parts { + if p == "" { + filteredParts = append(filteredParts, *v.EmptyAs) + } else { + filteredParts = append(filteredParts, p) + } + } + } else { + for _, p := range parts { + if p != "" { + filteredParts = append(filteredParts, p) + } + } + } + parts = filteredParts + + // Extract up to 3 integer components + var components []*big.Int + for _, p := range parts { + if len(components) >= 3 { + break + } + if !isDecimal(p) { + break + } + bi := new(big.Int) + if _, ok := bi.SetString(p, 10); ok { + components = append(components, bi) + } else { + break + } + } + + return coarseFromInts(big.NewInt(int64(epoch)), components...) +} + +func isDecimal(s string) bool { + if s == "" { + return false + } + for _, r := range s { + if r < '0' || r > '9' { + return false + } + } + + return true +} + +func coarseFromInts(epoch *big.Int, parts ...*big.Int) string { + // treat nil values as 0 + if epoch == nil { + epoch = bigZero + } + // if, somehow, the epoch is < 0, set whole string to min value + if epoch.Cmp(bigZero) < 0 { + return "00:00000000.00000000.00000000" + } + // if epoch > maximum, set whole string to max value + if epoch.Cmp(maxCoarseEpoch) > 0 { + return "99:99999999.99999999.99999999" + } + epochStr := fmt.Sprintf("%02d", epoch.Int64()) + partStrs := make([]string, 0, 3) + for _, part := range parts { + if len(partStrs) >= 3 { + break + } + // treat nil values as 0 + if part == nil { + part = bigZero + } + // if, somehow, there's a negative integer part, + // set it and the remaining parts to 0 + if part.Cmp(bigZero) < 0 { + for len(partStrs) < 3 { + partStrs = append(partStrs, "00000000") + } + + break + } + // if the part is above the maximum value, + // set it and remaining parts to max + if part.Cmp(maxCoarsePart) > 0 { + for len(partStrs) < 3 { + partStrs = append(partStrs, "99999999") + } + + 
break + } + partInt := part.Int64() // maxPart < int64 max so this is fine + partStrs = append(partStrs, fmt.Sprintf("%08d", partInt)) + } + // append 0's if we don't have enough parts + for len(partStrs) < 3 { + partStrs = append(partStrs, "00000000") + } + + return epochStr + ":" + strings.Join(partStrs, ".") +} From 44eddb1b4a9748ab444e03812d80fbf40662db79 Mon Sep 17 00:00:00 2001 From: michaelkedar Date: Fri, 10 Apr 2026 00:28:15 +0000 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=98=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- go/osv/ecosystem/coarse_fuzz_test.go | 301 ++++++++++++++++++ go/osv/ecosystem/dpkg.go | 7 +- go/osv/ecosystem/packagist.go | 13 + .../FuzzDpkgMonotonicity/637110e9290164fb | 3 + .../FuzzDpkgMonotonicity/800ea9d22c213672 | 3 + .../FuzzDpkgMonotonicity/d1988bf0dfaadeaa | 3 + .../c80af087e9ffbd87 | 3 + 7 files changed, 332 insertions(+), 1 deletion(-) create mode 100644 go/osv/ecosystem/coarse_fuzz_test.go create mode 100644 go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/637110e9290164fb create mode 100644 go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/800ea9d22c213672 create mode 100644 go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/d1988bf0dfaadeaa create mode 100644 go/osv/ecosystem/testdata/fuzz/FuzzPackagistMonotonicity/c80af087e9ffbd87 diff --git a/go/osv/ecosystem/coarse_fuzz_test.go b/go/osv/ecosystem/coarse_fuzz_test.go new file mode 100644 index 00000000000..3aaaf871797 --- /dev/null +++ b/go/osv/ecosystem/coarse_fuzz_test.go @@ -0,0 +1,301 @@ +package ecosystem + +import ( + "testing" +) + +func checkCoarseMonotonicityRaw(t *testing.T, e Ecosystem, v1Str, v2Str string) { + t.Helper() + v1, errP1 := e.Parse(v1Str) + v2, errP2 := e.Parse(v2Str) + + c1, errC1 := e.Coarse(v1Str) + c2, errC2 := e.Coarse(v2Str) + + if (errP1 == nil) != (errC1 == nil) { + t.Fatalf("Parse and Coarse success mismatch for %q: Parse err=%v, Coarse err=%v", v1Str, errP1, errC1) + } + if (errP2 == 
nil) != (errC2 == nil) { + t.Fatalf("Parse and Coarse success mismatch for %q: Parse err=%v, Coarse err=%v", v2Str, errP2, errC2) + } + + if errP1 != nil || errP2 != nil { + return // Skip monotonicity check if any failed + } + + comp, err := v1.Compare(v2) + if err != nil { + return + } + + if comp < 0 && c1 > c2 { + t.Errorf("Monotonicity violation: %q < %q but coarse %s > %s", v1Str, v2Str, c1, c2) + } + if comp > 0 && c1 < c2 { + t.Errorf("Monotonicity violation: %q > %q but coarse %s < %s", v1Str, v2Str, c1, c2) + } + if comp == 0 && c1 != c2 { + t.Errorf("Equality violation: %q == %q but coarse %s != %s", v1Str, v2Str, c1, c2) + } +} + +func FuzzPackagistMonotonicity(f *testing.F) { + seeds := []string{ + "1.0.0", "1.0.1", + "v1.0.0", "v1.0.1", + "1.0.0-beta1", + "0__1", "00", + "1.0.0+build1", + "1.0.0+bedrock-1.17.10", + "1..0", + "1.-.0", + } + for i, v1 := range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("Packagist") + if !ok { + t.Fatal("Packagist not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} + +func FuzzPyPIMonotonicity(f *testing.F) { + seeds := []string{ + "1.0.0", "1.0.1", + "1.0.0a1", "1.0.0.post1", + "1.0.dev1", "1.0", + "1.0.post1", "1.0.post2", + "0.0.1.post10035392509", + "2013-01-21T20:33:09+0100", + } + for i, v1 := range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("PyPI") + if !ok { + t.Fatal("PyPI not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} + +func FuzzMavenMonotonicity(f *testing.F) { + seeds := []string{ + "1.0.0", "1.0.1", + "1.0-alpha-1", "1.0.0.RELEASE", + "1.0", "1.0-SNAPSHOT", + "0.0", "alpha-alpha", + "0.0.0-2024-04-02T00-00-00-special-v20.9-plus-propertydatafetcher-fix", + "$%7Brevision%7D231.v678984136a_0b_", + "1..0", + "1--alpha", + } + for i, v1 := 
range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("Maven") + if !ok { + t.Fatal("Maven not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} + +func FuzzRubyGemsMonotonicity(f *testing.F) { + seeds := []string{ + "1.0.0", "1.0.1", + "1.0.0.a", + "0.0.0.1", + } + for i, v1 := range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("RubyGems") + if !ok { + t.Fatal("RubyGems not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} + +func FuzzCRANMonotonicity(f *testing.F) { + seeds := []string{ + "1.0-1", "1.0-2", + "1.0.1", "1.0.2", + } + for i, v1 := range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("CRAN") + if !ok { + t.Fatal("CRAN not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} + +func FuzzNuGetMonotonicity(f *testing.F) { + seeds := []string{ + "1.0.0", "1.0.1", + "1.0.0-alpha", + "0.10.1.1", + "0.0.0-20210218195015-ae50d9b99025", + } + for i, v1 := range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("NuGet") + if !ok { + t.Fatal("NuGet not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} + +func FuzzPubMonotonicity(f *testing.F) { + seeds := []string{ + "1.0.0", "1.0.1", + "1.0.0-alpha", + "0.13.0-nullsafety.0", + "0.1.0+1", + } + for i, v1 := range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("Pub") + if !ok { + t.Fatal("Pub not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} + +func FuzzHackageMonotonicity(f 
*testing.F) { + seeds := []string{ + "1.0.0", "1.0.1", + "1.0.0.1", "1.0.0.2", + } + for i, v1 := range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("Hackage") + if !ok { + t.Fatal("Hackage not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} + +func FuzzDpkgMonotonicity(f *testing.F) { + seeds := []string{ + "1:1.0-1", "1:1.0-2", + "1.0-1", "1.0-2", + "1.0", "1.1", + "1.0~rc1-1", + "1:0.0+git20161013.8b4af36+dfsg-3", + "0.0+git20160525~9bf299c-2", + "0.0.20-1.1~deb13u1", + } + for i, v1 := range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("Debian") + if !ok { + t.Fatal("Debian not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} + +func FuzzSemVerMonotonicity(f *testing.F) { + seeds := []string{ + "v1.0.0", "v1.0.1", + "v1.0.0-alpha", + "v0.0.0-alpha.0", + "v0.0.0-dev", + "v0.0.0-5VtLmtixx6V5PkcW", + "v0.0.0-20231016150651-428517fef5b9", + } + for i, v1 := range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("Go") + if !ok { + t.Fatal("Go not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} + +func FuzzRPMMonotonicity(f *testing.F) { + seeds := []string{ + "1:1.0-1", "1:1.0-2", + "1.0-1", "1.0-2", + "0.1.0~90-1.1", + "0.1.9+git.0.66be0d8-bp154.2.6.1", + } + for i, v1 := range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("Red Hat") + if !ok { + t.Fatal("Red Hat not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} + +func FuzzAPKMonotonicity(f *testing.F) { + f.Skip("Skipping Alpine fuzz test due to known transitivity violations") + seeds := []string{ + 
"1.0.0", "1.0.1", + "1.0.0-r1", "1.0.0-r2", + } + for i, v1 := range seeds { + for j := i; j < len(seeds); j++ { + f.Add(v1, seeds[j]) + } + } + f.Fuzz(func(t *testing.T, v1Str, v2Str string) { + e, ok := DefaultProvider.Get("Alpine") + if !ok { + t.Fatal("Alpine not found") + } + checkCoarseMonotonicityRaw(t, e, v1Str, v2Str) + }) +} diff --git a/go/osv/ecosystem/dpkg.go b/go/osv/ecosystem/dpkg.go index a42ff3fe0f4..fc59e3b38bc 100644 --- a/go/osv/ecosystem/dpkg.go +++ b/go/osv/ecosystem/dpkg.go @@ -37,6 +37,7 @@ func (e dpkgEcosystem) Parse(version string) (Version, error) { } func (e dpkgEcosystem) Coarse(version string) (string, error) { + version = strings.TrimSpace(version) epochStr, rest, hasColon := strings.Cut(version, ":") epoch := big.NewInt(0) if hasColon { @@ -57,7 +58,9 @@ func (e dpkgEcosystem) Coarse(version string) (string, error) { if len(parts) > 0 && !isDecimal(parts[0]) { // dpkg versions are actually required to start with numbers. // For some reason, semantic just treats these invalid versions as greater. - comps = append(comps, big.NewInt(100000000)) + if !strings.HasPrefix(parts[0], "~") { + comps = append(comps, big.NewInt(100000000)) + } } else { for i := 0; i < len(parts); i += 2 { p := parts[i] @@ -86,6 +89,8 @@ func (e dpkgEcosystem) Coarse(version string) (string, error) { firstChar < '.', firstChar == '~': // These are allowed characters that do not trigger overflow. + case strings.HasPrefix(sep, ".~"): + // "0.~" < "0." 
default: // Trigger an overflow because these characters are considered // greater than a single dot separator diff --git a/go/osv/ecosystem/packagist.go b/go/osv/ecosystem/packagist.go index b8737a6ab86..153b48e6b37 100644 --- a/go/osv/ecosystem/packagist.go +++ b/go/osv/ecosystem/packagist.go @@ -15,6 +15,7 @@ package ecosystem import ( + "errors" "fmt" "math/big" "regexp" @@ -30,6 +31,15 @@ type packagistEcosystem struct { var _ Enumerable = packagistEcosystem{} func (e packagistEcosystem) Parse(version string) (Version, error) { + if strings.Contains(version, "#") { + // A quirk of packagist comparison is that to compare numbers against non-numbers, numbers are replaced with '#' + // This means versions with a literal '#' compare equally to every number. + // e.g. 1.0 == #.0 and 2.0 == #.0 (but 1.0 < 2.0) + // If we allow this, we cannot guarantee an ordering of versions, so just treat any versions with # as invalid. + // No current Packagist vulns in OSV have '#' in their versions. + return nil, errors.New("packagist version may not contain '#'") + } + return SemanticVersionWrapper[semantic.PackagistVersion]{semantic.ParsePackagistVersion(version)}, nil } @@ -39,6 +49,9 @@ var packagistSepRegex = regexp.MustCompile(`[-_+.]`) // Treats 'p'/'pl' prefixes as maximal ints to ensure they sort after base versions // (e.g. 1.0 < 1.0-p1). 
func (e packagistEcosystem) Coarse(version string) (string, error) { + if strings.Contains(version, "#") { + return "", errors.New("packagist version may not contain '#'") + } version = strings.TrimPrefix(version, "v") version = strings.TrimPrefix(version, "V") diff --git a/go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/637110e9290164fb b/go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/637110e9290164fb new file mode 100644 index 00000000000..2497805ef83 --- /dev/null +++ b/go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/637110e9290164fb @@ -0,0 +1,3 @@ +go test fuzz v1 +string("1") +string("~") diff --git a/go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/800ea9d22c213672 b/go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/800ea9d22c213672 new file mode 100644 index 00000000000..3161492b948 --- /dev/null +++ b/go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/800ea9d22c213672 @@ -0,0 +1,3 @@ +go test fuzz v1 +string("0.~") +string("0.") diff --git a/go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/d1988bf0dfaadeaa b/go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/d1988bf0dfaadeaa new file mode 100644 index 00000000000..3738bc196c5 --- /dev/null +++ b/go/osv/ecosystem/testdata/fuzz/FuzzDpkgMonotonicity/d1988bf0dfaadeaa @@ -0,0 +1,3 @@ +go test fuzz v1 +string("1.0") +string("1. ") diff --git a/go/osv/ecosystem/testdata/fuzz/FuzzPackagistMonotonicity/c80af087e9ffbd87 b/go/osv/ecosystem/testdata/fuzz/FuzzPackagistMonotonicity/c80af087e9ffbd87 new file mode 100644 index 00000000000..e790d8582bf --- /dev/null +++ b/go/osv/ecosystem/testdata/fuzz/FuzzPackagistMonotonicity/c80af087e9ffbd87 @@ -0,0 +1,3 @@ +go test fuzz v1 +string("v1+0+0") +string("#100")