codeintel: Add additional canonicalization transformations (#45175)

2026-02-06 18:51:59 +00:00 · 2022-12-05 11:58:02 -06:00 · 2022-12-05 11:58:02 -06:00 · 83a87448e7
commit 83a87448e7
parent bfba204ffd
3 changed files with 172 additions and 34 deletions
--- a/enterprise/internal/codeintel/shared/types/scip_canonicalize.go
+++ b/enterprise/internal/codeintel/shared/types/scip_canonicalize.go
@ -1,75 +1,65 @@
 package types

-import (
-	"sort"
-
-	"github.com/sourcegraph/scip/bindings/go/scip"
-)
+import "github.com/sourcegraph/scip/bindings/go/scip"

 // CanonicalizeDocument deterministically re-orders the fields of the given document.
-// The input is modified in-place but returned for convenience.
 func CanonicalizeDocument(document *scip.Document) *scip.Document {
-	_ = CanonicalizeOccurrences(document.Occurrences)
-	_ = CanonicalizeSymbols(document.Symbols)
-
+	document.Occurrences = CanonicalizeOccurrences(document.Occurrences)
+	document.Symbols = CanonicalizeSymbols(document.Symbols)
 	return document
 }

 // CanonicalizeOccurrences deterministically re-orders the fields of the given occurrence slice.
-// The input is modified in-place but returned for convenience.
 func CanonicalizeOccurrences(occurrences []*scip.Occurrence) []*scip.Occurrence {
-	for _, occurrence := range occurrences {
-		_ = CanonicalizeOccurrence(occurrence)
+	canonicalized := make([]*scip.Occurrence, 0, len(occurrences))
+	for _, occurrence := range FlattenOccurrences(occurrences) {
+		canonicalized = append(canonicalized, CanonicalizeOccurrence(occurrence))
 	}

-	return SortOccurrences(occurrences)
+	return SortOccurrences(canonicalized)
 }

 // CanonicalizeOccurrence deterministically re-orders the fields of the given occurrence.
-// The input is modified in-place but returned for convenience.
 func CanonicalizeOccurrence(occurrence *scip.Occurrence) *scip.Occurrence {
 	// Express ranges as three-components if possible
 	occurrence.Range = scip.NewRange(occurrence.Range).SCIPRange()
-	_ = CanonicalizeDiagnostics(occurrence.Diagnostics)
+	occurrence.Diagnostics = CanonicalizeDiagnostics(occurrence.Diagnostics)
 	return occurrence
 }

 // CanonicalizeDiagnostics deterministically re-orders the fields of the given diagnostic slice.
-// The input is modified in-place but returned for convenience.
 func CanonicalizeDiagnostics(diagnostics []*scip.Diagnostic) []*scip.Diagnostic {
+	canonicalized := make([]*scip.Diagnostic, 0, len(diagnostics))
 	for _, diagnostic := range diagnostics {
-		_ = CanonicalizeDiagnostic(diagnostic)
+		canonicalized = append(canonicalized, CanonicalizeDiagnostic(diagnostic))
 	}

-	return SortDiagnostics(diagnostics)
+	return SortDiagnostics(canonicalized)
 }

 // CanonicalizeDiagnostic deterministically re-orders the fields of the given diagnostic.
-// The input is modified in-place but returned for convenience.
 func CanonicalizeDiagnostic(diagnostic *scip.Diagnostic) *scip.Diagnostic {
-	sort.Slice(diagnostic.Tags, func(i, j int) bool {
-		return diagnostic.Tags[i] < diagnostic.Tags[j]
-	})
-
+	diagnostic.Tags = SortDiagnosticTags(diagnostic.Tags)
 	return diagnostic
 }

 // CanonicalizeSymbols deterministically re-orders the fields of the given symbols slice.
-// The input is modified in-place but returned for convenience.
 func CanonicalizeSymbols(symbols []*scip.SymbolInformation) []*scip.SymbolInformation {
-	for _, symbol := range symbols {
-		_ = CanonicalizeSymbol(symbol)
+	canonicalized := make([]*scip.SymbolInformation, 0, len(symbols))
+	for _, symbol := range FlattenSymbols(symbols) {
+		canonicalized = append(canonicalized, CanonicalizeSymbol(symbol))
 	}

-	return SortSymbols(symbols)
+	return SortSymbols(canonicalized)
 }

 // CanonicalizeSymbol deterministically re-orders the fields of the given symbol.
-// The input is modified in-place but returned for convenience.
 func CanonicalizeSymbol(symbol *scip.SymbolInformation) *scip.SymbolInformation {
-	sort.Slice(symbol.Relationships, func(i, j int) bool {
-		return symbol.Relationships[i].Symbol < symbol.Relationships[j].Symbol
-	})
-
+	symbol.Relationships = CanonicalizeRelationships(symbol.Relationships)
 	return symbol
 }
+
+// CanonicalizeRelationships merges relationships that share a symbol name and sorts the result by symbol name.
+func CanonicalizeRelationships(relationships []*scip.Relationship) []*scip.Relationship {
+	return SortRelationships(FlattenRelationship(relationships))
+}
--- a/enterprise/internal/codeintel/shared/types/scip_flatten.go
+++ b/enterprise/internal/codeintel/shared/types/scip_flatten.go
@ -0,0 +1,107 @@
+package types
+
+import "github.com/sourcegraph/scip/bindings/go/scip"
+
+// FlattenDocuments merges elements of the given slice with the same relative path. This allows us to make
+// the assumption post-canonicalization that each index has one representation of a given document path in
+// the database. This function returns a new slice (in no particular order); merged documents are updated in-place.
+func FlattenDocuments(documents []*scip.Document) []*scip.Document {
+	documentMap := make(map[string]*scip.Document, len(documents))
+	for _, document := range documents {
+		existing, ok := documentMap[document.RelativePath]
+		if !ok {
+			documentMap[document.RelativePath] = document
+			continue
+		}
+		if existing.Language != document.Language {
+			_ = 0 // TODO - warn?
+		}
+
+		existing.Symbols = append(existing.Symbols, document.Symbols...)
+		existing.Occurrences = append(existing.Occurrences, document.Occurrences...)
+	}
+
+	flattened := make([]*scip.Document, 0, len(documentMap))
+	for _, document := range documentMap {
+		flattened = append(flattened, document)
+	}
+
+	return flattened
+}
+
+// FlattenSymbols merges elements of the given slice with the same symbol name. This allows us to make the
+// assumption post-canonicalization that each index and document refer to one symbol metadata object uniquely.
+// This function returns a new slice.
+func FlattenSymbols(symbols []*scip.SymbolInformation) []*scip.SymbolInformation {
+	symbolMap := make(map[string]*scip.SymbolInformation, len(symbols))
+	for _, symbol := range symbols {
+		existing, ok := symbolMap[symbol.Symbol]
+		if !ok {
+			symbolMap[symbol.Symbol] = symbol
+			continue
+		}
+
+		existing.Documentation = append(existing.Documentation, symbol.Documentation...)
+		existing.Relationships = append(existing.Relationships, symbol.Relationships...)
+	}
+
+	flattened := make([]*scip.SymbolInformation, 0, len(symbolMap))
+	for _, symbol := range symbolMap {
+		flattened = append(flattened, symbol)
+	}
+
+	return flattened
+}
+
+// FlattenOccurrences sorts the given slice in-place and merges elements with equivalent bounds into the first such element. A non-empty input yields a new slice.
+func FlattenOccurrences(occurrences []*scip.Occurrence) []*scip.Occurrence {
+	if len(occurrences) == 0 {
+		return occurrences
+	}
+
+	_ = SortOccurrences(occurrences)
+	flattened := make([]*scip.Occurrence, 0, len(occurrences))
+	flattened = append(flattened, occurrences[0])
+
+	for _, occurrence := range occurrences[1:] {
+		top := flattened[len(flattened)-1]
+
+		if !rawRangesEqual(top.Range, occurrence.Range) {
+			flattened = append(flattened, occurrence)
+			continue
+		}
+		if top.SyntaxKind != occurrence.SyntaxKind {
+			_ = 0 // TODO - warn?
+		}
+
+		top.SymbolRoles |= occurrence.SymbolRoles
+		top.OverrideDocumentation = append(top.OverrideDocumentation, occurrence.OverrideDocumentation...)
+		top.Diagnostics = append(top.Diagnostics, occurrence.Diagnostics...)
+	}
+
+	return flattened
+}
+
+// FlattenRelationship merges elements of the given slice with equivalent symbol names. This function returns a new
+// slice.
+func FlattenRelationship(relationships []*scip.Relationship) []*scip.Relationship {
+	relationshipMap := make(map[string][]*scip.Relationship, len(relationships))
+	for _, relationship := range relationships {
+		relationshipMap[relationship.Symbol] = append(relationshipMap[relationship.Symbol], relationship)
+	}
+
+	flattened := make([]*scip.Relationship, 0, len(relationshipMap))
+	for _, relationships := range relationshipMap {
+		combined := relationships[0]
+		for _, relationship := range relationships[1:] {
+			combined.IsReference = combined.IsReference || relationship.IsReference
+			combined.IsImplementation = combined.IsImplementation || relationship.IsImplementation
+			combined.IsTypeDefinition = combined.IsTypeDefinition || relationship.IsTypeDefinition
+			combined.IsDefinition = combined.IsDefinition || relationship.IsDefinition
+		}
+
+		flattened = append(flattened, combined)
+	}
+
+	return flattened
+}
--- a/enterprise/internal/codeintel/shared/types/scip_sort.go
+++ b/enterprise/internal/codeintel/shared/types/scip_sort.go
@ -27,15 +27,38 @@ func FindOccurrences(occurrences []*scip.Occurrence, targetLine, targetCharacter

 // SortOccurrences sorts the given occurrence slice (in-place) and returns it (for convenience).
 // Occurrences sorted in ascending order of their range's starting position, where enclosing ranges
-// come before the enclosed.
+// come before the enclosed. If there are multiple occurrences with the exact same range, then the
+// occurrences are sorted by symbol name.
 func SortOccurrences(occurrences []*scip.Occurrence) []*scip.Occurrence {
 	sort.Slice(occurrences, func(i, j int) bool {
+		if rawRangesEqual(occurrences[i].Range, occurrences[j].Range) {
+			return occurrences[i].Symbol < occurrences[j].Symbol
+		}
+
 		return compareRanges(occurrences[i].Range, occurrences[j].Range...) <= 0
 	})

 	return occurrences
 }

+// rawRangesEqual compares the given SCIP-encoded raw ranges for equality.
+func rawRangesEqual(a, b []int32) bool {
+	if len(a) == len(b) {
+		for i, v := range a {
+			if v != b[i] {
+				return false
+			}
+		}
+
+		return true
+	}
+
+	ra := scip.NewRange(a)
+	rb := scip.NewRange(b)
+
+	return ra.Start.Line == rb.Start.Line && ra.Start.Character == rb.Start.Character && ra.End.Line == rb.End.Line && ra.End.Character == rb.End.Character
+}
+
 // SortRanges sorts the given range slice (in-place) and returns it (for convenience). Ranges are
 // sorted in ascending order of starting position, where enclosing ranges come before the enclosed.
 func SortRanges(ranges []*scip.Range) []*scip.Range {
@ -64,7 +87,7 @@ func SortSymbols(symbols []*scip.SymbolInformation) []*scip.SymbolInformation {
 }

 // SortDiagnostics sorts the given diagnostics slice (in-place) and returns it (for convenience).
-// Diagnostics are sorted firs tyb severity (more severe earlier in the slice) and then by the
+// Diagnostics are sorted first by severity (more severe earlier in the slice) and then by the
 // diagnostic message.
 func SortDiagnostics(diagnostics []*scip.Diagnostic) []*scip.Diagnostic {
 	sort.Slice(diagnostics, func(i, j int) bool {
@ -80,6 +103,24 @@ func SortDiagnostics(diagnostics []*scip.Diagnostic) []*scip.Diagnostic {
 	return diagnostics
 }

+// SortDiagnosticTags sorts the given diagnostic tags slice (in-place) and returns it (for convenience).
+func SortDiagnosticTags(tags []scip.DiagnosticTag) []scip.DiagnosticTag {
+	sort.Slice(tags, func(i, j int) bool {
+		return tags[i] < tags[j]
+	})
+
+	return tags
+}
+
+// SortRelationships sorts the given symbol relationships slice (in-place) and returns it (for convenience).
+func SortRelationships(relationships []*scip.Relationship) []*scip.Relationship {
+	sort.Slice(relationships, func(i, j int) bool {
+		return relationships[i].Symbol < relationships[j].Symbol
+	})
+
+	return relationships
+}
+
 // compareRanges compares the order of the leading edge of the two ranges. This method returns
 //
 // - -1 if the leading edge of r2 occurs before r1,