Add fuzzy file finder with keyboard shortcut t. (#20921)

Previously, there was no way to interactively search for a file that
matches a given fuzzy query. This commit adds a new modal that activates
on the keyboard shortcut `t` and allows the user to quickly open a file
that fuzzily matches a given query.

The implementation in this commit is specifically designed to handle
large repositories with >100k source files. The fuzzy finder achieves
low-latency by:

1) implementing all the filtering inside the browser on the
   client-side
2) using bloom filters to quickly discard buckets of filenames that are
   guaranteed to not match a given query.  This technique is used by the
   Scala language server and documented in this blog post
   https://scalameta.org/metals/blog/2019/01/22/bloom-filters.html

The downsides of this approach:

1) it can take a while to first download all filenames in the
   repository, especially for large repos like chromium/chromium. It should be
   possible to optimize some parts of this on the backend.
2) the bloom filter algorithm is sometimes more strict compared to the
   fuzzy finder in VS Code and IntelliJ. There are techniques to make
   the fuzzy finder more fuzzy, for example we could make all-lowercase
   queries case-insensitive. It's also worth considering whether we want
   to use a different filtering algorithm for smaller repositories (<25k source files).
This commit is contained in:
Ólafur Páll Geirsson 2021-05-25 09:48:33 +02:00 committed by GitHub
parent 8d3173b1b4
commit 39a494eb92
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 1323 additions and 0 deletions

View File

@ -0,0 +1,145 @@
import { Shortcut } from '@slimsag/react-shortcuts'
import React, { useState } from 'react'
import { gql } from '@sourcegraph/shared/src/graphql/graphql'
import { useLocalStorage } from '@sourcegraph/shared/src/util/useLocalStorage'
import { requestGraphQL } from '../../backend/graphql'
import { FuzzySearch, SearchIndexing } from '../../fuzzyFinder/FuzzySearch'
import { FilesResult, FilesVariables } from '../../graphql-operations'
import {
KEYBOARD_SHORTCUT_CLOSE_FUZZY_FINDER,
KEYBOARD_SHORTCUT_FUZZY_FINDER,
} from '../../keyboardShortcuts/keyboardShortcuts'
import { FuzzyModal } from './FuzzyModal'
// Initial value of the result limit, and the amount by which the limit grows
// each time the user asks for more results.
const DEFAULT_MAX_RESULTS = 100
/**
 * Identifies the repository revision whose files the fuzzy finder searches.
 */
export interface FuzzyFinderProps {
// Repository name; sent as the `$repository` variable of the files query and
// used to scope the persisted query text.
repoName: string
// Revision to list files for; sent as the `$commit` variable of the files query.
commitID: string
}
/**
 * Owns all state of the fuzzy finder (visibility, query, focused result,
 * result limit and the indexing state machine) and renders the modal while it
 * is visible. One keyboard shortcut opens the modal, another one closes it.
 */
export const FuzzyFinder: React.FunctionComponent<FuzzyFinderProps> = props => {
    const [isVisible, setIsVisible] = useState(false)

    // NOTE: the query is cached in local storage to mimic the file pickers in
    // IntelliJ (by default) and VS Code (when "Workbench > Quick Open >
    // Preserve Input" is enabled).
    const [query, setQuery] = useLocalStorage(`fuzzy-modal.query.${props.repoName}`, '')

    // The "focus index" is the index of the file result that the user has
    // selected with up/down arrow keys. The focused item is highlighted and the
    // window.location is moved to that URL when the user presses the enter key.
    const [focusIndex, setFocusIndex] = useState(0)

    // The maximum number of results to display in the fuzzy finder. For large
    // repositories, a generic query like "src" may return thousands of results
    // making DOM rendering slow. The user can increase this number by clicking
    // on a button at the bottom of the result list.
    const [maxResults, setMaxResults] = useState(DEFAULT_MAX_RESULTS)

    // The state machine of the fuzzy finder. See `FuzzyFSM` for more details
    // about the state transitions.
    const [fsm, setFsm] = useState<FuzzyFSM>({ key: 'empty' })

    // Shows the modal and, if the input element is already in the DOM,
    // focuses it and selects its current text.
    const openModal = (): void => {
        setIsVisible(true)
        const input = document.querySelector<HTMLInputElement>('#fuzzy-modal-input')
        input?.focus()
        input?.select()
    }
    const closeModal = (): void => setIsVisible(false)
    const showMoreResults = (): void => setMaxResults(maxResults + DEFAULT_MAX_RESULTS)

    return (
        <>
            <Shortcut {...KEYBOARD_SHORTCUT_FUZZY_FINDER.keybindings[0]} onMatch={openModal} />
            <Shortcut {...KEYBOARD_SHORTCUT_CLOSE_FUZZY_FINDER.keybindings[0]} onMatch={closeModal} />
            {isVisible && (
                <FuzzyModal
                    {...props}
                    isVisible={isVisible}
                    onClose={closeModal}
                    query={query}
                    setQuery={setQuery}
                    focusIndex={focusIndex}
                    setFocusIndex={setFocusIndex}
                    maxResults={maxResults}
                    increaseMaxResults={showMoreResults}
                    fsm={fsm}
                    setFsm={setFsm}
                    downloadFilenames={() => downloadFilenames(props)}
                />
            )}
        </>
    )
}
/**
* The fuzzy finder modal is implemented as a state machine with the following transitions:
*
* ```
* [cached]
* v v
* Empty [uncached]> Downloading > Indexing > Ready
* > Failed
* ```
*
* - Empty: start state.
* - Downloading: downloading filenames from the remote server. The filenames
* are cached using the browser's CacheStorage, if available.
* - Indexing: processing the downloaded filenames. This step is usually
* instant, unless the repo is very large (>100k source files).
* In the torvalds/linux repo (~70k files), this step takes <1s
* on my computer but the chromium/chromium repo (~360k files)
* it takes ~3-5 seconds. This step is async so that the user can
* query against partially indexed results.
* - Ready: all filenames have been indexed.
* - Failed: something unexpected happened, the user can't fuzzy find files.
*/
// Discriminated union of all fuzzy finder states; `key` is the discriminant.
export type FuzzyFSM = Empty | Downloading | Indexing | Ready | Failed
// Start state: nothing has been loaded yet.
export interface Empty {
key: 'empty'
}
// The list of filenames is being downloaded.
export interface Downloading {
key: 'downloading'
}
// Filenames are being indexed; `indexing` carries the in-progress indexing
// state, which the modal uses to render partial results and a progress bar.
export interface Indexing {
key: 'indexing'
indexing: SearchIndexing
}
// Indexing has finished; `fuzzy` is the search instance that answers queries.
export interface Ready {
key: 'ready'
fuzzy: FuzzySearch
}
// Something went wrong; `errorMessage` is displayed to the user.
export interface Failed {
key: 'failed'
errorMessage: string
}
/**
 * Fetches the paths of all files of the repository at the given commit with a
 * single recursive GraphQL tree query.
 *
 * @param props the repository name and commit to list files for.
 * @returns the path of every file returned by the query.
 * @throws Error whose message is the JSON-serialized GraphQL result when the
 * response contains no file list.
 */
async function downloadFilenames(props: FuzzyFinderProps): Promise<string[]> {
    const { repoName, commitID } = props
    const response = await requestGraphQL<FilesResult, FilesVariables>(
        gql`
query Files($repository: String!, $commit: String!) {
repository(name: $repository) {
commit(rev: $commit) {
tree(recursive: true) {
files(first: 1000000, recursive: true) {
path
}
}
}
}
}
`,
        { repository: repoName, commit: commitID }
    ).toPromise()
    const files = response.data?.repository?.commit?.tree?.files
    if (!files) {
        // Surface the whole response: it is the only diagnostic available here.
        throw new Error(JSON.stringify(response))
    }
    return files.map(file => file.path)
}

View File

@ -0,0 +1,51 @@
// Full-viewport, semi-transparent backdrop that centers its content both
// horizontally and vertically.
.modal {
z-index: 1000;
position: fixed;
left: 0;
right: 0;
top: 0;
bottom: 0;
background-color: rgba(0, 0, 0, 0.5);
display: flex;
align-items: center;
justify-content: center;
}
// The dialog box rendered on top of the backdrop.
.content {
width: 80vw;
background-color: var(--color-bg-2);
}
// Shared padding for the area around the input (header) and the status row (footer).
.header,
.footer {
padding: 1rem;
}
// Scrollable result area with a fixed height so the dialog doesn't resize
// as the number of results changes.
.body {
height: 80vh;
overflow-y: scroll;
padding: 0.5rem;
}
// The query text input.
.input {
border: none;
width: 100%;
padding: 0.25rem;
font-size: 2em;
}
// The list of matching files.
.results {
text-align: left;
list-style-type: none;
padding: unset;
color: var(--body-color);
}
// The result currently selected with the keyboard.
.focused {
background-color: var(--color-bg-3);
}
// Footer layout (in addition to the padding declared above): children are
// pushed to opposite ends of the row.
.footer {
display: flex;
justify-content: space-between;
}

View File

@ -0,0 +1,359 @@
/* eslint-disable jsx-a11y/no-noninteractive-element-interactions */
// NOTE: the eslint-disable above can't be a eslint-disable-next-line because
// JSX syntax doesn't support comments on the line where it's needed.
import React from 'react'
import { CaseSensitiveFuzzySearch } from '../../fuzzyFinder/CaseSensitiveFuzzySearch'
import { FuzzySearch, FuzzySearchResult, SearchIndexing } from '../../fuzzyFinder/FuzzySearch'
import { FuzzyFinderProps, Indexing, FuzzyFSM } from './FuzzyFinder'
import styles from './FuzzyModal.module.scss'
import { HighlightedLink } from './HighlightedLink'
// Enable this URL query parameter to see debugging information like latency
// numbers and the false-positive ratio for the bloom filter.
// NOTE: evaluated once, when this module is loaded.
const IS_DEBUG = window.location.href.toString().includes('fuzzyFinder=debug')
// Cache for the last fuzzy query. This value is only used to avoid redoing the
// full fuzzy search on every re-render when the user presses the down/up arrow
// keys to move the "focus index".
// NOTE: this is module-level state, so it outlives any single modal instance.
const lastFuzzySearchResult = new Map<string, FuzzySearchResult>()
// The number of results to jump by on PageUp/PageDown keyboard shortcuts.
const PAGE_DOWN_INCREMENT = 10
/**
 * Props of the fuzzy finder modal. All state lives in the parent component;
 * the modal only receives the current values and the callbacks to change them.
 */
export interface FuzzyModalProps
extends VisibilityProps,
FuzzyFinderProps,
MaxResultsProps,
FocusIndexProps,
FuzzyFSMProps,
QueryProps {
// Downloads the list of filenames; called when nothing is cached yet.
downloadFilenames: () => Promise<string[]>
}
interface VisibilityProps {
isVisible: boolean
// Called to dismiss the modal (Escape, Enter on a result, backdrop, Close button).
onClose: () => void
}
interface QueryProps {
// The current text of the query input.
query: string
setQuery: (query: string) => void
}
interface MaxResultsProps {
// Upper bound on the number of results that are rendered.
maxResults: number
// Called when the user asks to see more results.
increaseMaxResults: () => void
}
interface FocusIndexProps {
// Index of the result that is highlighted and opened on Enter.
focusIndex: number
setFocusIndex: (focusIndex: number) => void
}
interface FuzzyFSMProps {
// Current state of the download/indexing state machine.
fsm: FuzzyFSM
setFsm: (fsm: FuzzyFSM) => void
}
/**
 * Component that interactively displays filenames in the open repository when given fuzzy queries.
 *
 * Similar to "Go to file" in VS Code or the "t" keyboard shortcut on github.com
 *
 * Keyboard handling on the input: Escape closes the modal, ArrowUp/ArrowDown
 * and PageUp/PageDown move the focused result (wrapping around), and Enter
 * opens the focused result.
 */
export const FuzzyModal: React.FunctionComponent<FuzzyModalProps> = props => {
    const fuzzyResult = renderFuzzyResult(props)

    // Moves the "focus index" by `increment`, wrapping around the number of
    // displayed filenames so that the user can press-hold the down arrow and
    // it goes all the way down and back up to the top result.
    function setRoundedFocusIndex(increment: number): void {
        const resultsCount = fuzzyResult.resultsCount
        if (resultsCount === 0) {
            // Nothing to focus. Without this guard the modulo below evaluates
            // to NaN, which would be stored as the focus index and stay NaN
            // on every following key press until the query changes.
            return
        }
        const index = (props.focusIndex + increment) % resultsCount
        // `%` keeps the sign of the dividend, so shift negative values back into range.
        const nextIndex = index < 0 ? resultsCount + index : index
        props.setFocusIndex(nextIndex)
        document.querySelector(`#fuzzy-modal-result-${nextIndex}`)?.scrollIntoView(false)
    }

    function onInputKeyDown(event: React.KeyboardEvent): void {
        switch (event.key) {
            case 'Escape':
                props.onClose()
                break
            case 'ArrowDown':
                event.preventDefault() // Don't move the cursor to the end of the input.
                setRoundedFocusIndex(1)
                break
            case 'PageDown':
                setRoundedFocusIndex(PAGE_DOWN_INCREMENT)
                break
            case 'ArrowUp':
                event.preventDefault() // Don't move the cursor to the start of the input.
                setRoundedFocusIndex(-1)
                break
            case 'PageUp':
                setRoundedFocusIndex(-PAGE_DOWN_INCREMENT)
                break
            case 'Enter':
                if (props.focusIndex < fuzzyResult.resultsCount) {
                    // Navigate by clicking the rendered link of the focused result.
                    const fileAnchor = document.querySelector<HTMLAnchorElement>(
                        `#fuzzy-modal-result-${props.focusIndex} a`
                    )
                    fileAnchor?.click()
                    props.onClose()
                }
                break
            default:
        }
    }

    return (
        // Use 'onMouseDown' instead of 'onClick' to allow selecting the text and mouse up outside the modal
        <div role="navigation" className={styles.modal} onMouseDown={() => props.onClose()}>
            <div role="navigation" className={styles.content} onMouseDown={event => event.stopPropagation()}>
                <div className={styles.header}>
                    <input
                        autoComplete="off"
                        id="fuzzy-modal-input"
                        className={styles.input}
                        value={props.query}
                        onChange={event => {
                            props.setQuery(event.target.value)
                            // A new query produces a new result list, so start from the top again.
                            props.setFocusIndex(0)
                        }}
                        type="text"
                        onKeyDown={onInputKeyDown}
                    />
                </div>
                <div className={styles.body}>{fuzzyResult.element}</div>
                <div className={styles.footer}>
                    <button type="button" className="btn btn-secondary" onClick={() => props.onClose()}>
                        Close
                    </button>
                    {fuzzyFooter(props.fsm, fuzzyResult)}
                </div>
            </div>
        </div>
    )
}
/**
 * Formats a count with its noun, e.g. "1 result", "2 results" or "100+ results".
 *
 * @param what the singular noun.
 * @param count the number to display, formatted with `toLocaleString`.
 * @param isComplete when false, a "+" is appended to the number to signal
 * that there are more items than counted.
 */
function plural(what: string, count: number, isComplete: boolean): string {
    const formattedCount = count.toLocaleString()
    const truncationMarker = isComplete ? '' : '+'
    const pluralSuffix = count === 1 ? '' : 's'
    return `${formattedCount}${truncationMarker} ${what}${pluralSuffix}`
}
/**
 * Renders the status part of the modal footer.
 *
 * In debug mode it shows the false-positive percentage and the elapsed search
 * time; otherwise it shows the result count, the total file count and, while
 * indexing, a progress bar.
 */
function fuzzyFooter(fsm: FuzzyFSM, files: RenderedFuzzyResult): JSX.Element {
    if (IS_DEBUG) {
        return (
            <>
                <span>{files.falsePositiveRatio && Math.round(files.falsePositiveRatio * 100)}fp</span>
                <span>{files.elapsedMilliseconds && Math.round(files.elapsedMilliseconds).toLocaleString()}ms</span>
            </>
        )
    }
    return (
        <>
            <span>{plural('result', files.resultsCount, files.isComplete)}</span>
            <span>
                {fsm.key === 'indexing' && indexingProgressBar(fsm)}
                {plural('total file', files.totalFileCount, true)}
            </span>
        </>
    )
}
/**
 * Renders a `<progress>` element showing how many files have been indexed
 * out of the total; the rounded percentage is the element's fallback text.
 */
function indexingProgressBar(indexing: Indexing): JSX.Element {
    const progress = indexing.indexing
    const done = progress.indexedFileCount
    const total = progress.totalFileCount
    const roundedPercent = Math.round((done / total) * 100)
    return (
        <progress value={done} max={total}>
            {roundedPercent}%
        </progress>
    )
}
/**
 * The rendered body of the modal together with the numbers that the footer
 * and the keyboard navigation need.
 */
interface RenderedFuzzyResult {
// The element displayed in the modal body.
element: JSX.Element
// Number of results that are actually rendered.
resultsCount: number
// False when the result list was truncated.
isComplete: boolean
// Number of files in the search index.
totalFileCount: number
// Duration of the fuzzy search (only shown in debug mode).
elapsedMilliseconds?: number
// False-positive ratio of the bloom filter (only shown in debug mode).
falsePositiveRatio?: number
}
/**
 * Renders the modal body for the current state of the state machine and kicks
 * off the work that moves the machine to its next state:
 *
 * - `empty`: starts loading the filenames (cache first, then download).
 * - `indexing`: schedules the next indexing step and renders the partial results.
 * - `ready`: renders the results of the fully indexed search.
 * - `downloading` / `failed`: render a status message.
 */
function renderFuzzyResult(props: FuzzyModalProps): RenderedFuzzyResult {
    function empty(element: JSX.Element): RenderedFuzzyResult {
        return {
            element,
            resultsCount: 0,
            isComplete: true,
            totalFileCount: 0,
        }
    }

    // Turns a rejection reason into a displayable message. `JSON.stringify`
    // on an `Error` produces "{}" because its properties are not enumerable,
    // so errors are described by their message instead.
    function describeError(error: unknown): string {
        if (error instanceof Error) {
            return error.message || error.name
        }
        return typeof error === 'string' ? error : JSON.stringify(error)
    }

    // Creates a rejection handler that moves the state machine to `failed`
    // and rethrows an error naming the step (`what`) that failed.
    function onError(what: string): (error: unknown) => void {
        return error => {
            props.setFsm({ key: 'failed', errorMessage: describeError(error) })
            throw new Error(what)
        }
    }

    const usuallyFast =
        "This step is usually fast unless it's a very large repository. The result is cached so you only have to wait for it once :)"

    switch (props.fsm.key) {
        case 'empty':
            handleEmpty(props).then(() => {}, onError('onEmpty'))
            return empty(<></>)
        case 'downloading':
            return empty(<p>Downloading... {usuallyFast}</p>)
        case 'failed':
            return empty(<p>Error: {props.fsm.errorMessage}</p>)
        case 'indexing': {
            const loader = props.fsm.indexing
            // Yield to the event loop before continuing so that the partial
            // results below get rendered first.
            later()
                .then(() => continueIndexing(loader))
                .then(next => props.setFsm(next), onError('onIndexing'))
            return renderFiles(props, props.fsm.indexing.partialFuzzy, props.fsm.indexing)
        }
        case 'ready':
            return renderFiles(props, props.fsm.fuzzy)
        default:
            return empty(<p>ERROR</p>)
    }
}
/**
 * Runs the fuzzy search for the current query and renders the matching files.
 *
 * The last search result is memoized so that re-renders which don't change
 * the search inputs (for example moving the focus index) don't repeat the search.
 *
 * @param props the modal props; supplies the query, result limit and repository revision.
 * @param search the search instance to query (partial while indexing).
 * @param indexing the indexing progress, only given while indexing is ongoing.
 */
function renderFiles(props: FuzzyModalProps, search: FuzzySearch, indexing?: SearchIndexing): RenderedFuzzyResult {
    const indexedFileCount = indexing ? indexing.indexedFileCount : ''
    // The memoization map is module-level state, so the key must identify the
    // repository revision as well. Otherwise a result (including its URLs)
    // computed for another repository or commit is served for the same query text.
    const cacheKey = `${props.repoName}@${props.commitID}:${props.query}-${props.maxResults}-${indexedFileCount}`
    let fuzzyResult = lastFuzzySearchResult.get(cacheKey)
    if (!fuzzyResult) {
        const start = window.performance.now()
        fuzzyResult = search.search({
            query: props.query,
            maxResults: props.maxResults,
            createUrl: filename => `/${props.repoName}@${props.commitID}/-/blob/${filename}`,
            onClick: () => props.onClose(),
        })
        fuzzyResult.elapsedMilliseconds = window.performance.now() - start
        lastFuzzySearchResult.clear() // Only cache the last query.
        lastFuzzySearchResult.set(cacheKey, fuzzyResult)
    }

    const links = fuzzyResult.links
    if (links.length === 0) {
        return {
            element: <p>No files matching '{props.query}'</p>,
            resultsCount: 0,
            totalFileCount: search.totalFileCount,
            isComplete: fuzzyResult.isComplete,
        }
    }

    const linksToRender = links.slice(0, props.maxResults)
    return {
        element: (
            <ul className={`${styles.results} text-monospace`}>
                {linksToRender.map((file, fileIndex) => (
                    <li
                        id={`fuzzy-modal-result-${fileIndex}`}
                        key={file.text}
                        className={fileIndex === props.focusIndex ? styles.focused : ''}
                    >
                        <HighlightedLink {...file} />
                    </li>
                ))}
                {!fuzzyResult.isComplete && (
                    <li>
                        <button className="btn btn-secondary" type="button" onClick={() => props.increaseMaxResults()}>
                            (...truncated, click to show more results){' '}
                        </button>
                    </li>
                )}
            </ul>
        ),
        resultsCount: linksToRender.length,
        totalFileCount: search.totalFileCount,
        isComplete: fuzzyResult.isComplete,
        elapsedMilliseconds: fuzzyResult.elapsedMilliseconds,
        falsePositiveRatio: fuzzyResult.falsePositiveRatio,
    }
}
/**
 * Builds the key under which the filenames of a repository revision are
 * stored in the browser cache.
 */
function filesCacheKey(props: FuzzyModalProps): string {
    const { repoName, commitID } = props
    return '/fuzzy-modal.files.' + repoName + '.' + commitID
}
/**
 * Opens the browser cache that holds the downloaded filenames.
 */
function openCaches(): Promise<Cache> {
    const cacheName = 'fuzzy-modal'
    return caches.open(cacheName)
}
/**
 * Returns a promise that resolves after a zero-delay timeout has fired.
 */
async function later(): Promise<void> {
    return new Promise<void>(done => {
        setTimeout(() => done(), 0)
    })
}
/**
 * Runs the next indexing step and converts its outcome into the matching
 * state of the fuzzy finder: `indexing` while there is more work to do,
 * `ready` once the index is complete.
 */
async function continueIndexing(indexing: SearchIndexing): Promise<FuzzyFSM> {
    const next = await indexing.continue()
    return next.key === 'indexing' ? { key: 'indexing', indexing: next } : { key: 'ready', fuzzy: next.value }
}
/**
 * Loads the filenames of the repository revision from the browser cache and
 * starts indexing them.
 *
 * @returns the next state of the fuzzy finder, or `undefined` when the cache
 * API is unavailable, there is no entry for this revision, or the entry is
 * not a JSON array of strings. Treating an unusable entry as a cache miss
 * lets the caller download the filenames again instead of failing on every
 * attempt.
 */
async function loadCachedIndex(props: FuzzyModalProps): Promise<FuzzyFSM | undefined> {
    const cacheAvailable = 'caches' in self
    if (!cacheAvailable) {
        return undefined
    }
    const cache = await openCaches()
    const fromCache = await cache.match(new Request(filesCacheKey(props)))
    if (!fromCache) {
        return undefined
    }
    let filenames: unknown
    try {
        filenames = JSON.parse(await fromCache.text())
    } catch {
        // The cached body is not valid JSON.
        return undefined
    }
    if (!Array.isArray(filenames) || !filenames.every((filename): filename is string => typeof filename === 'string')) {
        return undefined
    }
    return handleFilenames(filenames)
}
/**
 * Stores the filenames of the repository revision in the browser cache as a
 * JSON array. Does nothing when the cache API is unavailable.
 */
async function cacheFilenames(props: FuzzyModalProps, filenames: string[]): Promise<void> {
    if (!('caches' in self)) {
        return
    }
    const key = filesCacheKey(props)
    const cache = await openCaches()
    const serialized = new Response(JSON.stringify(filenames))
    await cache.put(key, serialized)
}
/**
 * Leaves the `empty` state: uses the cached filenames when available,
 * otherwise downloads them (state `downloading`), starts indexing and stores
 * the downloaded filenames in the cache.
 *
 * A failing download or indexing start moves the state machine to `failed`.
 * Errors from reading the cache are not caught here and reject the returned promise.
 */
async function handleEmpty(props: FuzzyModalProps): Promise<void> {
    const fromCache = await loadCachedIndex(props)
    if (fromCache) {
        props.setFsm(fromCache)
        return
    }
    props.setFsm({ key: 'downloading' })
    try {
        const filenames = await props.downloadFilenames()
        props.setFsm(handleFilenames(filenames))
        // Caching is best-effort: a failure to store the filenames must not
        // affect the finder, so the outcome is deliberately ignored.
        cacheFilenames(props, filenames).then(
            () => {},
            () => {}
        )
    } catch (error) {
        // `JSON.stringify` on an `Error` produces "{}" because its properties
        // are not enumerable; show the message instead.
        let errorMessage: string
        if (error instanceof Error) {
            errorMessage = error.message || error.name
        } else {
            errorMessage = typeof error === 'string' ? error : JSON.stringify(error)
        }
        props.setFsm({ key: 'failed', errorMessage })
    }
}
/**
 * Starts indexing the given filenames and returns the matching state of the
 * fuzzy finder: `ready` when indexing finished immediately, `indexing` otherwise.
 */
function handleFilenames(filenames: string[]): FuzzyFSM {
    const searchValues = filenames.map(text => ({ text }))
    const indexing = CaseSensitiveFuzzySearch.fromSearchValuesAsync(searchValues)
    return indexing.key === 'ready' ? { key: 'ready', fuzzy: indexing.value } : { key: 'indexing', indexing }
}

View File

@ -0,0 +1,15 @@
// Text color shared by every highlighted (matched) range.
.highlighted {
color: var(--oc-black);
}
// Background of a matched range that is not an exact word match.
.fuzzy {
background-color: var(--oc-yellow-4);
}
// Background of a matched range that encloses an exact word.
.exact {
background-color: var(--oc-yellow-3);
}
// The link fills the width of its container so the whole row is clickable.
.link {
display: inline-block;
width: 100%;
height: 100%;
max-height: 1em;
}

View File

@ -0,0 +1,69 @@
import React from 'react'
import { Link } from 'react-router-dom'
import styles from './HighlightedLink.module.scss'
/**
 * A highlighted range of the text, from `startOffset` (inclusive) to
 * `endOffset` (exclusive), as used with `String.prototype.slice`.
 */
export interface RangePosition {
startOffset: number
endOffset: number
/**
* Does this range enclose an exact word?
*/
isExact: boolean
}
/**
 * A piece of text with highlighted ranges that optionally links to a URL.
 */
export interface HighlightedLinkProps {
// The full text to render.
text: string
// The ranges of `text` to highlight; rendered in the given order.
positions: RangePosition[]
// When set, the text is rendered as a link to this URL.
url?: string
// Called when the link is clicked.
onClick?: () => void
}
/**
 * Returns the sum of the start offsets of all highlighted ranges. A smaller
 * sum means that the matches are located earlier in the text.
 */
export function offsetSum(props: HighlightedLinkProps): number {
    return props.positions.reduce((total: number, position: RangePosition) => total + position.startOffset, 0)
}
/**
 * React component that renders text with highlighted subranges.
 *
 * Used to render fuzzy finder results. For example, given the query "doc/read"
 * we want to highlight 'Doc' and 'READ' in the filename
 * 'Documentation/README.md'.
 *
 * The text is split into spans: highlighted spans for every given position and
 * plain spans for the text between them. The spans are wrapped in a link when
 * a URL is given.
 */
export const HighlightedLink: React.FunctionComponent<HighlightedLinkProps> = props => {
    const spans: JSX.Element[] = []

    // Appends the text between the two offsets as a span; empty ranges are skipped.
    const appendSpan = (className: string, from: number, to: number): void => {
        if (from < to) {
            spans.push(
                <span key={`${from}-${to}`} className={className}>
                    {props.text.slice(from, to)}
                </span>
            )
        }
    }

    // Offset of the first character that has not been rendered yet.
    let cursor = 0
    for (const { startOffset, endOffset, isExact } of props.positions) {
        if (startOffset > cursor) {
            // Plain text between the previous highlight and this one.
            appendSpan('', cursor, startOffset)
        }
        cursor = endOffset
        const matchClassName = isExact ? styles.exact : styles.fuzzy
        appendSpan(`${styles.highlighted} ${matchClassName}`, startOffset, endOffset)
    }
    // Plain text after the last highlight.
    appendSpan('', cursor, props.text.length)

    if (!props.url) {
        return <>{spans}</>
    }
    return (
        <Link className={styles.link} to={props.url} onClick={() => props.onClick?.()}>
            {spans}
        </Link>
    )
}

View File

@ -0,0 +1,96 @@
import { CaseSensitiveFuzzySearch, allFuzzyParts, fuzzyMatchesQuery } from './CaseSensitiveFuzzySearch'
import { FuzzySearchParameters } from './FuzzySearch'
// Filenames shared by all search tests. The query 't' is expected to return
// this list unchanged, so the order of the entries matters.
const all = [
't1/README.md',
't2/Readme.md',
't1/READMES.md',
'.tsconfig.json',
'to/the/moon.jpg',
'lol/business.txt',
'haha/business.txt',
't3/siteConfig.json',
'business/crazy.txt',
'fuzzy/business.txt',
'.travis/workflows/config.json',
'test/WorkspaceSymbolProvider.scala',
]
// Search index over all filenames above, built synchronously.
const fuzzy = CaseSensitiveFuzzySearch.fromSearchValues(all.map(text => ({ text })))
/**
 * Registers a test asserting that searching the shared index for `query`
 * returns exactly the `expected` filenames, and that every expected filename
 * is also found when it is the only entry of an index.
 */
function checkSearch(query: string, expected: string[]) {
    test(`search-${query}`, () => {
        const queryProps: FuzzySearchParameters = { query, maxResults: 1000 }
        const matchedTexts = (index: CaseSensitiveFuzzySearch): string[] =>
            index.search(queryProps).links.map(link => link.text)

        expect(matchedTexts(fuzzy)).toStrictEqual(expected)
        for (const result of expected) {
            const singleFileIndex = CaseSensitiveFuzzySearch.fromSearchValues([{ text: result }])
            expect(matchedTexts(singleFileIndex)).toStrictEqual([result])
        }
    })
}
/**
 * Registers a test asserting that `original` is split into the `expected`
 * fuzzy parts when delimiters are excluded.
 */
function checkParts(name: string, original: string, expected: string[]) {
    const title = `allFuzzyParts-${name}`
    test(title, () => {
        const parts = allFuzzyParts(original, false)
        expect(parts).toStrictEqual(expected)
    })
}
/**
 * Registers a test asserting that matching `query` against `value` highlights
 * exactly the `expected` substrings of `value`, in order.
 */
function checkFuzzyMatch(name: string, query: string, value: string, expected: string[]) {
    test(`fuzzyMatchesQuery-${name}`, () => {
        const highlighted = fuzzyMatchesQuery(query, value).map(({ startOffset, endOffset }) =>
            value.slice(startOffset, endOffset)
        )
        expect(highlighted).toStrictEqual(expected)
    })
}
// Test suite for the case-sensitive fuzzy search: splitting names into parts,
// matching a query against a single value, and searching the indexed fixture.
describe('case sensitive fuzzy search', () => {
    describe('splitting a filename into parts works as expected', () => {
        checkParts('basic', 'haha/business.txt', ['haha', 'business', 'txt'])
        checkParts('snake_case', 'haha_business.txt', ['haha', 'business', 'txt'])
        checkParts('camelCase', 'hahaBusiness.txt', ['haha', 'Business', 'txt'])
        checkParts('CamelCase', 'HahaBusiness.txt', ['Haha', 'Business', 'txt'])
        // This case used to be registered twice with an identical title and body.
        checkParts('kebab-case', 'haha-business.txt', ['haha', 'business', 'txt'])
        checkParts('dotfile', '.tsconfig.json', ['tsconfig', 'json'])
    })
    describe('fuzzy matching selects the correct substrings', () => {
        checkFuzzyMatch('dotfile', 'ts', '.tsconfig.json', ['ts'])
        checkFuzzyMatch('basic', 'ha/busi', 'haha/business.txt', ['ha', '/', 'busi'])
        checkFuzzyMatch('all-lowercase', 'readme', 't1/README.md', ['README'])
        checkFuzzyMatch('all-lowercase2', 'readme', 't2/Readme.md', ['Readme'])
        checkFuzzyMatch('digits', 't2', 't2/Readme.md', ['t2'])
        checkFuzzyMatch('consume-delimeter-negative', 'ts/json', '.tsconfig.json', [])
        checkFuzzyMatch('consume-delimeter-positive', 'ts/json', '.tsconfig/json', ['ts', '/', 'json'])
        checkFuzzyMatch('consume-delimeter-end-of-word', 'ts/', '.tsconfig/json', ['ts', '/'])
        checkFuzzyMatch('consume-delimeter-start-of-word', '.ts/', '.tsconfig/json', ['.', 'ts', '/'])
    })
    describe('fuzzy searching against the bloom filter returns the correct results', () => {
        checkSearch('h/bus', ['haha/business.txt'])
        checkSearch('moon', ['to/the/moon.jpg'])
        checkSearch('t/moon', ['to/the/moon.jpg'])
        checkSearch('t/t/moon', ['to/the/moon.jpg'])
        checkSearch('t.t.moon', [])
        checkSearch('t t moon', [])
        checkSearch('jpg', ['to/the/moon.jpg'])
        checkSearch('t/mo', ['to/the/moon.jpg'])
        checkSearch('mo', ['to/the/moon.jpg'])
        checkSearch('t', all)
        checkSearch('readme', ['t1/README.md', 't2/Readme.md', 't1/READMES.md'])
        checkSearch('README', ['t1/README.md', 't1/READMES.md'])
        checkSearch('Readme', ['t2/Readme.md'])
        checkSearch('WSProvider', ['test/WorkspaceSymbolProvider.scala'])
        checkSearch('t2', ['t2/Readme.md'])
    })
    describe('caveat: validate the fuzzy finder is quite strict with capitalization', () => {
        checkSearch('sitecon', [])
        checkFuzzyMatch('sitecon', 'sitecon', 'website/siteConfig.js', [])
    })
})

View File

@ -0,0 +1,468 @@
import { BloomFilter } from 'bloomfilter'
import { HighlightedLinkProps, offsetSum, RangePosition } from '../components/fuzzyFinder/HighlightedLink'
import { FuzzySearch, IndexingFSM, FuzzySearchParameters, FuzzySearchResult, SearchValue } from './FuzzySearch'
import { Hasher } from './Hasher'
/**
* We don't index filenames with length larger than this value.
*/
const MAX_VALUE_LENGTH = 100
// Normally, you need multiple hash functions to keep the false-positive ratio
// low. However, non-empirical observations indicate that a single hash function
// works fine and provides the fastest indexing time in large repositories like
// Chromium.
const DEFAULT_BLOOM_FILTER_HASH_FUNCTION_COUNT = 1
// The number of filenames to group together in a single bucket, and the number
// of string prefixes that each bloom filter can contain. Currently, every bucket can
// contain up to 262,144 prefixes (conservatively large number). With bucket
// size 50, my off-the-napkin calculation is that total memory usage with 400k
// files (Chromium size) may be as large as ~261mb. It's usable on most
// computers, but still a bit high.
// Tracking issue to fine-tune these parameters: https://github.com/sourcegraph/sourcegraph/issues/21201
const DEFAULT_BUCKET_SIZE = 50
const DEFAULT_BLOOM_FILTER_SIZE = 2 << 17
/**
* Fuzzy-matches the given query against the given value.
*
* The query is split into fuzzy parts (delimiters included) which are then
* matched against the value in order.
*
* @returns the ranges of `value` that were matched by the query parts, or an
* empty array when not all query parts could be matched.
*/
export function fuzzyMatchesQuery(query: string, value: string): RangePosition[] {
return fuzzyMatches(allFuzzyParts(query, true), value)
}
/**
 * Case-sensitive fuzzy search that uses bloom filters for low-latency filtering
 * in large repositories (>100k files).
 *
 * NOTE(olafur): this is a reimplementation of the fuzzy finder in the Scala
 * language server that's documented in this blog post here
 * https://scalameta.org/metals/blog/2019/01/22/bloom-filters.html#fuzzy-symbol-search
 *
 * In a nutshell, bloom filters improve performance by allowing us to skip a
 * "bucket" of candidate files if we know that bucket does not match any words
 * in that query. For example, the query "SymPro" is split into the words "Sym"
 * and "Pro". If a bucket of 500 words is guaranteed to have no appearances of
 * the words "Sym" and "Pro", then we can skip those 500 words and move on to
 * the next bucket.
 *
 * One downside of the bloom filter approach is that it requires an indexing
 * phase that can take a couple of seconds to complete on a large input size
 * (>100k filenames). The indexing phase can take a while to complete because we
 * need to compute all possible words that the user may query. For example,
 * given the filename "SymbolProvider", we create a bloom filter with all
 * possible prefixes of "Symbol" and "Provider". Fortunately, bloom filters can be
 * serialized so that the indexing step only runs once per repoName/commitID pair.
 */
export class CaseSensitiveFuzzySearch extends FuzzySearch {
    /** The number of files in all buckets. */
    public totalFileCount = 0

    constructor(public readonly buckets: Bucket[]) {
        super()
        for (const bucket of buckets) {
            this.totalFileCount += bucket.files.length
        }
    }

    /**
     * Starts indexing the given files incrementally.
     *
     * NOTE: `files` is sorted in place by ascending text length.
     *
     * @returns a `ready` state when the indexer is already done, otherwise an
     * `indexing` state that carries the progress so far, a search over the
     * files indexed so far, and a `continue` callback that runs the next step.
     */
    public static fromSearchValuesAsync(files: SearchValue[], bucketSize: number = DEFAULT_BUCKET_SIZE): IndexingFSM {
        files.sort((a, b) => a.text.length - b.text.length)
        const indexer = new Indexer(files, bucketSize)
        function loop(): IndexingFSM {
            if (indexer.isDone()) {
                return { key: 'ready', value: indexer.complete() }
            }
            indexer.processBuckets(25000)
            return {
                key: 'indexing',
                indexedFileCount: indexer.indexedFileCount(),
                totalFileCount: indexer.totalFileCount(),
                partialFuzzy: indexer.complete(),
                continue: () => new Promise(resolve => resolve(loop())),
            }
        }
        return loop()
    }

    /**
     * Indexes all given files synchronously and returns the finished search.
     */
    public static fromSearchValues(
        files: SearchValue[],
        bucketSize: number = DEFAULT_BUCKET_SIZE
    ): CaseSensitiveFuzzySearch {
        const indexer = new Indexer(files, bucketSize)
        while (!indexer.isDone()) {
            indexer.processBuckets(bucketSize)
        }
        return indexer.complete()
    }

    /**
     * Returns at most `query.maxResults` files that match the query, sorted by
     * relevance. `isComplete` is false when the search stopped early because
     * the limit was reached. An empty query returns the indexed files without
     * highlights.
     */
    public search(query: FuzzySearchParameters): FuzzySearchResult {
        if (query.query.length === 0) {
            return this.emptyResult(query)
        }
        let falsePositives = 0
        const result: HighlightedLinkProps[] = []
        const hashParts = allQueryHashParts(query.query)
        const queryParts = allFuzzyParts(query.query, true)
        const complete = (isComplete: boolean): FuzzySearchResult =>
            this.sorted({ links: result, isComplete, falsePositiveRatio: falsePositives / this.buckets.length })
        for (const bucket of this.buckets) {
            const matches = bucket.matches(query, queryParts, hashParts)
            // A bucket that was searched but contained no match is a false
            // positive of the bloom filter.
            if (!matches.skipped && matches.value.length === 0) {
                falsePositives++
            }
            for (const value of matches.value) {
                if (result.length >= query.maxResults) {
                    return complete(false)
                }
                result.push(value)
            }
        }
        return complete(true)
    }

    /**
     * Sorts the links of the result in place: shorter texts first, then
     * earlier matches, then by locale comparison of the text.
     */
    private sorted(result: FuzzySearchResult): FuzzySearchResult {
        result.links.sort((a, b) => {
            const byLength = a.text.length - b.text.length
            if (byLength !== 0) {
                return byLength
            }
            const byEarliestMatch = offsetSum(a) - offsetSum(b)
            if (byEarliestMatch !== 0) {
                return byEarliestMatch
            }
            return a.text.localeCompare(b.text)
        })
        return result
    }

    /**
     * The result for an empty query: the first `query.maxResults` indexed
     * files without highlights. `isComplete` is false when there are more
     * files than the limit allows.
     */
    private emptyResult(query: FuzzySearchParameters): FuzzySearchResult {
        const result: HighlightedLinkProps[] = []
        const complete = (isComplete: boolean): FuzzySearchResult => this.sorted({ links: result, isComplete })
        for (const bucket of this.buckets) {
            for (const value of bucket.files) {
                // Check the limit before adding, like `search` does, so that
                // never more than `maxResults` links are returned.
                if (result.length >= query.maxResults) {
                    return complete(false)
                }
                result.push({
                    text: value.text,
                    positions: [],
                    url: query.createUrl ? query.createUrl(value.text) : undefined,
                    onClick: query.onClick,
                })
            }
        }
        return complete(true)
    }
}
/**
 * Splits the value into its fuzzy parts. A new part starts after a delimiter
 * character and at every uppercase character, for example "hahaBusiness.txt"
 * is split into "haha", "Business" and "txt".
 *
 * @param value the filename or query to split.
 * @param includeDelimeters when true, every delimiter character is included
 * in the result as a part of its own; when false, delimiters are dropped.
 */
export function allFuzzyParts(value: string, includeDelimeters: boolean): string[] {
const buf: string[] = []
// `start` and `end` delimit the part that is currently being scanned.
let start = 0
let end = 0
while (end < value.length) {
// Emit the part found by the previous iteration (nothing on the first one).
if (end > start) {
buf.push(value.slice(start, end))
}
// Consume the delimiters that follow the part.
while (end < value.length && isDelimeter(value[end])) {
if (includeDelimeters) {
buf.push(value[end])
}
end++
}
start = end
// Scan from `end + 1` so that an uppercase first character belongs to the
// new part instead of immediately ending it.
end = nextFuzzyPart(value, end + 1)
}
// Emit the trailing part that the loop found but didn't emit.
if (start < value.length && end > start) {
buf.push(value.slice(start, end))
}
return buf
}
/**
 * Returns true if the value sorts between '0' and '9' (inclusive) in string
 * comparison, which is the case for every single digit character.
 */
function isDigit(value: string): boolean {
    const isAtLeastZero = value >= '0'
    const isAtMostNine = value <= '9'
    return isAtLeastZero && isAtMostNine
}
/**
 * Returns true if the value is lowercase or a digit, and not a delimiter.
 */
function isLowercaseCharacter(value: string): boolean {
    if (!isLowercaseOrDigit(value)) {
        return false
    }
    return !isDelimeter(value)
}
/**
 * Returns true if the value is a digit, or if it is unchanged by lowercasing
 * while being changed by uppercasing (i.e. it contains cased characters and
 * all of them are lowercase).
 */
function isLowercaseOrDigit(value: string): boolean {
    if (isDigit(value)) {
        return true
    }
    if (value.toLowerCase() !== value) {
        return false
    }
    return value !== value.toUpperCase()
}
/**
 * Returns true if the value is uppercase and not a delimiter.
 */
function isUppercaseCharacter(value: string): boolean {
    if (!isUppercase(value)) {
        return false
    }
    return !isDelimeter(value)
}
/**
 * Returns true if the value is unchanged by uppercasing while being changed
 * by lowercasing (i.e. it contains cased characters and all of them are
 * uppercase).
 */
function isUppercase(value: string): boolean {
    if (value.toUpperCase() !== value) {
        return false
    }
    return value !== value.toLowerCase()
}
/**
 * Returns true if the character ends a fuzzy part: it is either a delimiter
 * or uppercase.
 */
function isDelimeterOrUppercase(character: string): boolean {
    if (isDelimeter(character)) {
        return true
    }
    return isUppercase(character)
}
/**
 * Returns true if the character separates words in a filename or query:
 * slash, underscore, dash, dot or space.
 */
function isDelimeter(character: string): boolean {
    return (
        character === '/' || character === '_' || character === '-' || character === '.' || character === ' '
    )
}
/**
 * Matches the query parts against the value in order and returns the matched
 * ranges of the value, or an empty array when not every query part matched.
 */
function fuzzyMatches(queries: string[], value: string): RangePosition[] {
const result: RangePosition[] = []
const matcher = new FuzzyMatcher(queries, value)
while (!matcher.isDone()) {
// Remember the kind of the current query part: `queryIndex` may advance
// below, before the next start offset is computed.
const isCurrentQueryDelimeter = matcher.isQueryDelimeter()
// A non-delimiter query part can't match a delimiter, skip over them.
while (!matcher.isQueryDelimeter() && matcher.isStartDelimeter()) {
matcher.start++
}
if (matcher.matchesFromStart()) {
result.push(matcher.rangePositionFromStart())
matcher.queryIndex++
}
matcher.start = matcher.nextStart(isCurrentQueryDelimeter)
}
// Only report a match when every query part has been matched.
return matcher.queryIndex >= queries.length ? result : []
}
/**
 * Cursor state for matching a list of query parts against a value.
 */
class FuzzyMatcher {
// Index of the query part that has to be matched next.
public queryIndex = 0
// Offset in the value where the next match attempt starts.
public start = 0
// Lowercased copy of the value, used for case-insensitive comparisons.
private lowercaseValue: string
constructor(private readonly queries: string[], private readonly value: string) {
this.lowercaseValue = value.toLowerCase()
}
/**
 * Returns the offset of the next match attempt. When the query part to match
 * next is a delimiter, that is the next occurrence of the delimiter in the
 * value (or the end of the value); otherwise it is the start of the next
 * fuzzy part of the value, after skipping delimiters.
 *
 * @param isCurrentQueryDelimeter whether the query part of the current
 * attempt was a delimiter; in that case the scan starts at `start` itself
 * instead of the offset after it.
 */
public nextStart(isCurrentQueryDelimeter: boolean): number {
const offset = isCurrentQueryDelimeter ? this.start : this.start + 1
let end = this.isQueryDelimeter()
? this.indexOfDelimeter(this.query(), offset)
: nextFuzzyPart(this.value, offset)
while (end < this.value.length && !this.isQueryDelimeter() && isDelimeter(this.value[end])) {
end++
}
return end
}
/**
 * Returns the range of the value covered by the current query part when it
 * is matched at `start`. The range is exact when it ends at the end of the
 * value or right before the start of a new word.
 */
public rangePositionFromStart(): RangePosition {
const end = this.start + this.query().length
return {
startOffset: this.start,
endOffset: end,
isExact: end >= this.value.length || startsNewWord(this.value, end),
}
}
/**
 * Returns true if the current query part matches the value at `start`.
 * Query parts that are lowercase (or digits) are compared against the
 * lowercased value and additionally require the matched text to be
 * "properly capitalized"; other query parts are compared case-sensitively.
 */
public matchesFromStart(): boolean {
const caseInsensitive = this.isCaseInsensitive()
const compareValue = caseInsensitive ? this.lowercaseValue : this.value
return (
compareValue.startsWith(this.query(), this.start) &&
(!caseInsensitive || isCapitalizedPart(this.value, this.start, this.query()))
)
}
// Returns true if the character of the value at `start` is a delimiter.
public isStartDelimeter(): boolean {
return isDelimeter(this.value[this.start])
}
// Returns true when all query parts are matched or the value is exhausted.
public isDone(): boolean {
return this.queryIndex >= this.queries.length || this.start >= this.value.length
}
// The query part to match next; undefined once all parts are matched.
public query(): string {
return this.queries[this.queryIndex]
}
// Returns true if the current query part is lowercase or a digit.
public isCaseInsensitive(): boolean {
return isLowercaseOrDigit(this.query())
}
// Returns true if the current query part is a delimiter character.
public isQueryDelimeter(): boolean {
return isDelimeter(this.query())
}
// Like `indexOf`, but returns the length of the value when there is no occurrence.
public indexOfDelimeter(delim: string, start: number): number {
const index = this.value.indexOf(delim, start)
return index < 0 ? this.value.length : index
}
}
/**
 * Returns true if the character at `index` begins a new word: it is either a
 * delimiter or uppercase character, or it is a lowercase character whose
 * predecessor is not lowercase.
 */
function startsNewWord(value: string, index: number): boolean {
    const current = value[index]
    if (isDelimeterOrUppercase(current)) {
        return true
    }
    if (!isLowercaseCharacter(current)) {
        return false
    }
    return !isLowercaseCharacter(value[index - 1])
}
/**
 * Checks whether value.substring(start, start + query.length) is "properly capitalized".
 *
 * A slice is properly capitalized as long as no lowercase character or digit in
 * it is immediately followed by a character that is not a lowercase character or
 * digit. For example:
 *
 * - Not properly capitalized: "InnerClasses" "innerClasses"
 * - Properly capitalized: "Innerclasses" "INnerclasses"
 */
function isCapitalizedPart(value: string, start: number, query: string): boolean {
    // Never read past the end of the value, even when the query is longer than what remains.
    const end = Math.min(value.length, start + query.length)
    let sawLowercase = false
    for (let position = start; position < end; position++) {
        const isLowercase = isLowercaseOrDigit(value[position])
        if (sawLowercase && !isLowercase) {
            return false
        }
        sawLowercase = isLowercase
    }
    return true
}
/**
 * Returns the index of the first delimiter or uppercase character at or after
 * `start`. When there is none, the scan stops at the end of the value; when
 * `start` is already past the end, `start` itself is returned.
 */
function nextFuzzyPart(value: string, start: number): number {
    let position = start
    for (; position < value.length; position++) {
        if (isDelimeterOrUppercase(value[position])) {
            break
        }
    }
    return position
}
/**
 * Builds a bloom filter containing the hash parts of every value whose text is
 * shorter than MAX_VALUE_LENGTH. Longer values are left out of the filter.
 */
function populateBloomFilter(values: SearchValue[]): BloomFilter {
    const filter = new BloomFilter(DEFAULT_BLOOM_FILTER_SIZE, DEFAULT_BLOOM_FILTER_HASH_FUNCTION_COUNT)
    for (const { text } of values) {
        if (text.length >= MAX_VALUE_LENGTH) {
            continue
        }
        updateHashParts(text, filter)
    }
    return filter
}
/**
 * Returns the hashcode of every non-empty prefix of every fuzzy part of the query.
 *
 * The hasher is reset at the start of each part, so prefixes never span two parts.
 */
function allQueryHashParts(query: string): number[] {
    const hasher = new Hasher()
    const digests: number[] = []
    for (const part of allFuzzyParts(query, false)) {
        hasher.reset()
        for (const character of part) {
            // Each update extends the current prefix by one character.
            digests.push(hasher.update(character).digest())
        }
    }
    return digests
}
/**
 * Adds the hashcodes of all word prefixes of `value` to the bloom filter `buf`.
 *
 * Two running hashes are maintained: `words` over the characters as written
 * and `lowercaseWords` over their lowercased form. Both restart at every
 * delimiter or uppercase character. After each non-delimiter character the
 * exact prefix hash is added, plus the lowercase prefix hash when it differs.
 *
 * At the first character of a run of consecutive uppercase characters, the
 * lowercased prefixes of the whole run are also added.
 *
 * @param value the text to index.
 * @param buf the bloom filter that receives the hashcodes.
 */
function updateHashParts(value: string, buf: BloomFilter): void {
    const words = new Hasher()
    const lowercaseWords = new Hasher()
    for (let index = 0; index < value.length; index++) {
        const character = value[index]
        if (isDelimeterOrUppercase(character)) {
            words.reset()
            lowercaseWords.reset()
            const startsUppercaseRun =
                isUppercaseCharacter(character) && (index === 0 || !isUppercaseCharacter(value[index - 1]))
            if (startsUppercaseRun) {
                // Index the lowercased prefixes of the entire uppercase run.
                for (
                    let uppercaseWordIndex = index;
                    uppercaseWordIndex < value.length && isUppercaseCharacter(value[uppercaseWordIndex]);
                    uppercaseWordIndex++
                ) {
                    lowercaseWords.update(value[uppercaseWordIndex].toLowerCase())
                    buf.add(lowercaseWords.digest())
                }
                // The run was only a lookahead; the regular per-character hashing below starts fresh.
                lowercaseWords.reset()
            }
        }
        if (isDelimeter(character)) {
            continue
        }
        words.update(character)
        lowercaseWords.update(character.toLowerCase())
        const exactDigest = words.digest()
        const lowercaseDigest = lowercaseWords.digest()
        buf.add(exactDigest)
        if (exactDigest !== lowercaseDigest) {
            buf.add(lowercaseDigest)
        }
    }
}
/**
 * Outcome of running a query against a single bucket.
 */
interface BucketResult {
/** True when the bucket's bloom filter ruled the bucket out, so its files were not scanned. */
skipped: boolean
/** Links for the files in the bucket that matched the query; empty when the bucket was skipped. */
value: HighlightedLinkProps[]
}
/**
 * A group of files together with a bloom filter that summarizes them, so that
 * a query can discard the whole group without scanning each file.
 */
class Bucket {
    constructor(
        public readonly files: SearchValue[],
        public readonly filter: BloomFilter,
        public readonly id: number
    ) {}

    /**
     * Creates a bucket from the given files. The `files` array is sorted in
     * place by text length (shortest first) and the bucket gets a random id.
     */
    public static fromSearchValues(files: SearchValue[]): Bucket {
        files.sort((left, right) => left.text.length - right.text.length)
        const filter = populateBloomFilter(files)
        return new Bucket(files, filter, Math.random())
    }

    /** Returns false when at least one hash part is missing from the bloom filter. */
    private matchesMaybe(hashParts: number[]): boolean {
        return hashParts.every(part => this.filter.test(part))
    }

    /**
     * Runs the query against this bucket.
     *
     * The bucket is reported as skipped when the bloom filter rules it out.
     * Otherwise every file is fuzzy matched against `queryParts` and a link is
     * produced for each file that has at least one matched position.
     */
    public matches(query: FuzzySearchParameters, queryParts: string[], hashParts: number[]): BucketResult {
        if (!this.matchesMaybe(hashParts)) {
            return { skipped: true, value: [] }
        }
        const links: HighlightedLinkProps[] = []
        for (const { text } of this.files) {
            const positions = fuzzyMatches(queryParts, text)
            if (positions.length === 0) {
                continue
            }
            const url = query.createUrl ? query.createUrl(text) : undefined
            links.push({ text, positions, url, onClick: query.onClick })
        }
        return { skipped: false, value: links }
    }
}
/**
 * Incrementally groups files into buckets of at most `bucketSize` files so
 * that indexing can be spread over several calls to `processBuckets`.
 *
 * Note: the constructor sorts the given `files` array in place by text length
 * (shortest first).
 */
class Indexer {
    /** Files collected for the bucket that is currently being built. */
    private buffer: SearchValue[] = []
    /** Buckets that have been completed so far. */
    private buckets: Bucket[] = []
    /** Number of files that have been consumed from `files`. */
    private index = 0

    constructor(private readonly files: SearchValue[], private readonly bucketSize: number) {
        this.files.sort((a, b) => a.text.length - b.text.length)
    }

    /** Returns a fuzzy search over the buckets that have been built so far. */
    public complete(): CaseSensitiveFuzzySearch {
        return new CaseSensitiveFuzzySearch(this.buckets)
    }

    /** Returns true once every file has been placed into a bucket. */
    public isDone(): boolean {
        return this.index >= this.files.length
    }

    /** Returns the total number of files handed to this indexer. */
    public totalFileCount(): number {
        return this.files.length
    }

    /** Returns the number of files that have been placed into buckets so far. */
    public indexedFileCount(): number {
        return this.index
    }

    /**
     * Builds buckets until roughly `fileCount` more files have been indexed or
     * all files are consumed.
     *
     * The bucket budget is `fileCount / bucketSize` and may be fractional; any
     * positive remainder still produces one more bucket.
     */
    public processBuckets(fileCount: number): void {
        let bucketCount = fileCount / this.bucketSize
        while (bucketCount > 0 && !this.isDone()) {
            const endIndex = Math.min(this.files.length, this.index + this.bucketSize)
            while (this.index < endIndex) {
                this.buffer.push(this.files[this.index])
                this.index++
            }
            if (this.buffer.length === 0) {
                // An array is always truthy, so test its length. An empty buffer means no
                // file was consumed (e.g. a zero bucket size): stop instead of pushing
                // empty buckets forever without making progress.
                break
            }
            this.buckets.push(Bucket.fromSearchValues(this.buffer))
            this.buffer = []
            bucketCount--
        }
    }
}

View File

@ -0,0 +1,48 @@
import { HighlightedLinkProps } from '../components/fuzzyFinder/HighlightedLink'
/**
 * Input for a single fuzzy search.
 */
export interface FuzzySearchParameters {
/** The query text entered by the user. */
query: string
/** NOTE(review): presumably an upper bound on the number of returned links; confirm against the search implementation. */
maxResults: number
/** Optional callback that turns the text of a matching value into the URL of its link. */
createUrl?: (value: string) => string
/** Optional click handler that is attached to every produced link. */
onClick?: () => void
}
/**
 * Output of a single fuzzy search.
 */
export interface FuzzySearchResult {
/** The links to render, one per matching value. */
links: HighlightedLinkProps[]
/** NOTE(review): presumably false when results were truncated (e.g. by `maxResults`); confirm against the search implementation. */
isComplete: boolean
/** NOTE(review): presumably the time spent searching, in milliseconds; confirm against the search implementation. */
elapsedMilliseconds?: number
/** NOTE(review): presumably the ratio of buckets that passed the bloom filter but contained no match; confirm against the search implementation. */
falsePositiveRatio?: number
}
/**
 * A single searchable value.
 */
export interface SearchValue {
/** The text that queries are matched against. */
text: string
}
/** State of the indexing process; the variants are discriminated by their `key` property. */
export type IndexingFSM = SearchIndexing | SearchReady
/**
 * Indexing state with `key: 'indexing'`.
 */
export interface SearchIndexing {
key: 'indexing'
/** NOTE(review): presumably the number of files indexed so far; confirm against the indexer. */
indexedFileCount: number
/** NOTE(review): presumably the total number of files to index; confirm against the indexer. */
totalFileCount: number
/** NOTE(review): presumably a search over the files indexed so far; confirm against the indexer. */
partialFuzzy: FuzzySearch
/** Resolves to the next indexing state. */
continue: () => Promise<IndexingFSM>
}
/**
 * Indexing state with `key: 'ready'`.
 */
export interface SearchReady {
key: 'ready'
/** The fuzzy search carried by this state. */
value: FuzzySearch
}
/**
 * Superclass for different fuzzy finding algorithms.
 *
 * Currently, there is only one implementation that is case sensitive. This
 * implementation is specifically tailored for large repos that have >400k
 * source files. Most users will likely prefer case-insensitive fuzzy filtering,
 * which is easy to support for small repos (<20k files) but it's not clear how
 * to support that in larger repos without sacrificing latency.
 *
 * Tracking issue to add case-insensitive search: https://github.com/sourcegraph/sourcegraph/issues/21201
 */
export abstract class FuzzySearch {
/** NOTE(review): presumably the number of files this search covers; confirm against the implementations. */
public abstract totalFileCount: number
/** Runs a fuzzy search with the given parameters and returns the matching links. */
public abstract search(parameters: FuzzySearchParameters): FuzzySearchResult
}

View File

@ -0,0 +1,29 @@
/**
 * Streaming hasher: feed it characters one at a time and read the hashcode of
 * everything fed so far in O(1) after each step.
 *
 * This makes it possible to hash every prefix of a string of length N in O(N)
 * total time. For the string "Doc", the hashcodes of "D", "Do" and "Doc" are
 * obtained with three constant-time updates, whereas hashing each prefix from
 * scratch would cost O(N^2) overall.
 */
export class Hasher {
    private currentHash = 0

    /**
     * Folds every UTF-16 code unit of `character` into the running hash using
     * 32-bit signed integer arithmetic. Returns this instance to allow chaining.
     */
    public update(character: string): Hasher {
        let hash = this.currentHash
        for (let position = 0; position < character.length; position++) {
            const codeUnit = character.charCodeAt(position)
            hash = (Math.imul(31, hash) + codeUnit) | 0
        }
        this.currentHash = hash
        return this
    }

    /** Returns the hashcode of everything fed in since the last reset. */
    public digest(): number {
        return this.currentHash
    }

    /** Discards the running hash and starts over with `character`. */
    public resetWith(character: string): void {
        this.currentHash = 0
        this.update(character)
    }

    /** Discards the running hash. */
    public reset(): void {
        this.currentHash = 0
    }
}

View File

@ -27,6 +27,18 @@ export const KEYBOARD_SHORTCUT_FOCUS_SEARCHBAR: KeyboardShortcut = {
keybindings: [{ ordered: ['/'] }],
}
/** Keyboard shortcut titled 'Fuzzy search files', bound to the `t` key. */
export const KEYBOARD_SHORTCUT_FUZZY_FINDER: KeyboardShortcut = {
id: 'fuzzyFinder',
title: 'Fuzzy search files',
keybindings: [{ ordered: ['t'] }],
}
/** Keyboard shortcut titled 'Close fuzzy search files', bound to the `Escape` key. */
export const KEYBOARD_SHORTCUT_CLOSE_FUZZY_FINDER: KeyboardShortcut = {
id: 'closeFuzzyFiles',
title: 'Close fuzzy search files',
keybindings: [{ ordered: ['Escape'] }],
}
export const KEYBOARD_SHORTCUT_COPY_FULL_QUERY: KeyboardShortcut = {
id: 'copyFullQuery',
title: 'Copy full query',
@ -42,6 +54,8 @@ export const KEYBOARD_SHORTCUTS: KeyboardShortcut[] = [
KEYBOARD_SHORTCUT_SWITCH_THEME,
KEYBOARD_SHORTCUT_SHOW_HELP,
KEYBOARD_SHORTCUT_FOCUS_SEARCHBAR,
KEYBOARD_SHORTCUT_FUZZY_FINDER,
KEYBOARD_SHORTCUT_CLOSE_FUZZY_FINDER,
KEYBOARD_SHORTCUT_COPY_FULL_QUERY,
]

View File

@ -35,6 +35,7 @@ import { AuthenticatedUser } from '../auth'
import { ErrorMessage } from '../components/alerts'
import { BreadcrumbSetters, BreadcrumbsProps } from '../components/Breadcrumbs'
import { ErrorBoundary } from '../components/ErrorBoundary'
import { FuzzyFinder } from '../components/fuzzyFinder/FuzzyFinder'
import { HeroPage } from '../components/HeroPage'
import { ActionItemsBarProps, useWebActionItems } from '../extensions/components/ActionItemsBar'
import { ExternalLinkFields, RepositoryFields } from '../graphql-operations'
@ -383,6 +384,12 @@ export const RepoContainer: React.FunctionComponent<RepoContainerProps> = props
return (
<div className="repo-container test-repo-container w-100 d-flex flex-column">
{!isErrorLike(props.settingsCascade.final) &&
props.settingsCascade.final?.experimentalFeatures?.fuzzyFinder &&
resolvedRevisionOrError &&
!isErrorLike(resolvedRevisionOrError) && (
<FuzzyFinder repoName={repoName} commitID={resolvedRevisionOrError.commitID} />
)}
{showExtensionAlert && (
<InstallBrowserExtensionAlert
isChrome={IS_CHROME}

View File

@ -131,6 +131,7 @@
"@testing-library/react": "^10.4.8",
"@testing-library/react-hooks": "^3.4.1",
"@types/babel__core": "7.1.12",
"@types/bloomfilter": "^0.0.0",
"@types/chai": "4.2.14",
"@types/chai-as-promised": "7.1.3",
"@types/chrome": "0.0.127",
@ -316,6 +317,7 @@
"@visx/mock-data": "^1.7.0",
"@visx/scale": "^1.7.0",
"@visx/xychart": "^1.7.3",
"bloomfilter": "^0.0.18",
"bootstrap": "^4.5.2",
"classnames": "^2.2.6",
"comlink": "^4.3.0",

View File

@ -1302,6 +1302,8 @@ type SettingsExperimentalFeatures struct {
EnableFastResultLoading *bool `json:"enableFastResultLoading,omitempty"`
// EnableSmartQuery description: Enables contextual syntax highlighting and hovers for search queries in the web app
EnableSmartQuery *bool `json:"enableSmartQuery,omitempty"`
// FuzzyFinder description: Enables fuzzy finder with keyboard shortcut `t`.
FuzzyFinder *bool `json:"fuzzyFinder,omitempty"`
// SearchStats description: Enables a new page that shows language statistics about the results for a search query.
SearchStats *bool `json:"searchStats,omitempty"`
// SearchStreaming description: Enables experimental streaming support.

View File

@ -147,6 +147,14 @@
"!go": {
"pointer": true
}
},
"fuzzyFinder": {
"description": "Enables fuzzy finder with keyboard shortcut `t`.",
"type": "boolean",
"default": false,
"!go": {
"pointer": true
}
}
},
"group": "Experimental"

View File

@ -3817,6 +3817,11 @@
dependencies:
"@babel/types" "^7.3.0"
"@types/bloomfilter@^0.0.0":
version "0.0.0"
resolved "https://registry.npmjs.org/@types/bloomfilter/-/bloomfilter-0.0.0.tgz#fbfdf4fbf1d9f4775ee0ef952e58b138e5dae423"
integrity sha512-bZhGgidbIsN1sgI3tLwyxTpSa9Dkd9V5MrDU/O92OiqqEN+CNOxbyzPYAFOoBFw4VAh2EdzqT4XucKZEkrI0JQ==
"@types/body-parser@*":
version "1.17.0"
resolved "https://registry.npmjs.org/@types/body-parser/-/body-parser-1.17.0.tgz#9f5c9d9bd04bb54be32d5eb9fc0d8c974e6cf58c"
@ -6729,6 +6734,11 @@ blob@0.0.5:
resolved "https://registry.npmjs.org/blob/-/blob-0.0.5.tgz#d680eeef25f8cd91ad533f5b01eed48e64caf683"
integrity sha512-gaqbzQPqOoamawKg0LGVd7SzLgXS+JH61oWprSLH+P+abTczqJbhTR8CmJ2u9/bUYNmHTGJx/UEmn6doAvvuig==
bloomfilter@^0.0.18:
version "0.0.18"
resolved "https://registry.npmjs.org/bloomfilter/-/bloomfilter-0.0.18.tgz#6d55d34f0a214b235287b4eac9203ac623413dab"
integrity sha512-CbnyHE78gY1tpXS/Ap+B0RJxKdRWCDzjBnX97UJSG8rdLv1PK8GiTWc/CCQyWu6PWVD4lUceeFrqC6Mf3nMgOA==
bluebird-retry@^0.11.0:
version "0.11.0"
resolved "https://registry.npmjs.org/bluebird-retry/-/bluebird-retry-0.11.0.tgz#1289ab22cbbc3a02587baad35595351dd0c1c047"