search: Fix/improve client side query parser (#46588)

* search: Fix/improve client side query parser

The current implement incorrectly handles "leaves". For example 'a b OR c'
is parsed as

{
  type: 'operator',
  operands: [
    {type: 'pattern', value: 'a'},
    {type: 'pattern', value: 'b'},
    {type: 'pattern', value: 'c'},
  ]
}

which is wrong, or at least imprecise. Without more information it's not
clear whether this node represents 'a OR b c' or 'a b OR c'.

With this commit consecutive "leaves" are represented as a "sequence"
instead (see test output changes).
This commit is contained in:
Felix Kling 2023-01-18 13:54:36 +01:00 committed by GitHub
parent 6e3abcef6a
commit c9b27fb69a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 635 additions and 153 deletions

View File

@ -12,45 +12,54 @@ describe('parseSearchQuery', () => {
expect(parse('repo:foo a b c')).toMatchInlineSnapshot(`
[
{
"type": "parameter",
"field": "repo",
"value": "foo",
"negated": false,
"type": "sequence",
"nodes": [
{
"type": "parameter",
"field": "repo",
"value": "foo",
"negated": false,
"range": {
"start": 0,
"end": 8
}
},
{
"type": "pattern",
"kind": 1,
"value": "a",
"quoted": false,
"negated": false,
"range": {
"start": 9,
"end": 10
}
},
{
"type": "pattern",
"kind": 1,
"value": "b",
"quoted": false,
"negated": false,
"range": {
"start": 11,
"end": 12
}
},
{
"type": "pattern",
"kind": 1,
"value": "c",
"quoted": false,
"negated": false,
"range": {
"start": 13,
"end": 14
}
}
],
"range": {
"start": 0,
"end": 8
}
},
{
"type": "pattern",
"kind": 1,
"value": "a",
"quoted": false,
"negated": false,
"range": {
"start": 9,
"end": 10
}
},
{
"type": "pattern",
"kind": 1,
"value": "b",
"quoted": false,
"negated": false,
"range": {
"start": 11,
"end": 12
}
},
{
"type": "pattern",
"kind": 1,
"value": "c",
"quoted": false,
"negated": false,
"range": {
"start": 13,
"end": 14
}
}
@ -64,24 +73,33 @@ describe('parseSearchQuery', () => {
"type": "operator",
"operands": [
{
"type": "pattern",
"kind": 1,
"value": "a",
"quoted": false,
"negated": false,
"type": "sequence",
"nodes": [
{
"type": "pattern",
"kind": 1,
"value": "a",
"quoted": false,
"negated": false,
"range": {
"start": 0,
"end": 1
}
},
{
"type": "pattern",
"kind": 1,
"value": "b",
"quoted": false,
"negated": false,
"range": {
"start": 2,
"end": 3
}
}
],
"range": {
"start": 0,
"end": 1
}
},
{
"type": "pattern",
"kind": 1,
"value": "b",
"quoted": false,
"negated": false,
"range": {
"start": 2,
"end": 3
}
},
@ -144,8 +162,35 @@ describe('parseSearchQuery', () => {
]
`))
test('query with not', () =>
expect(parse('not a')).toMatchInlineSnapshot(`
[
{
"type": "operator",
"operands": [
{
"type": "pattern",
"kind": 1,
"value": "a",
"quoted": false,
"negated": false,
"range": {
"start": 4,
"end": 5
}
}
],
"kind": "NOT",
"range": {
"start": 0,
"end": 5
}
}
]
`))
test('query with and/or operator precedence', () =>
expect(parse('a or b and c')).toMatchInlineSnapshot(`
expect(parse('a or b and not c')).toMatchInlineSnapshot(`
[
{
"type": "operator",
@ -176,28 +221,38 @@ describe('parseSearchQuery', () => {
}
},
{
"type": "pattern",
"kind": 1,
"value": "c",
"quoted": false,
"negated": false,
"type": "operator",
"operands": [
{
"type": "pattern",
"kind": 1,
"value": "c",
"quoted": false,
"negated": false,
"range": {
"start": 15,
"end": 16
}
}
],
"kind": "NOT",
"range": {
"start": 11,
"end": 12
"end": 16
}
}
],
"kind": "AND",
"range": {
"start": 5,
"end": 12
"end": 16
}
}
],
"kind": "OR",
"range": {
"start": 0,
"end": 12
"end": 16
}
}
]
@ -327,6 +382,149 @@ describe('parseSearchQuery', () => {
}
]
`)
expect(parse('not (a or b) and c')).toMatchInlineSnapshot(`
[
{
"type": "operator",
"operands": [
{
"type": "operator",
"operands": [
{
"type": "operator",
"operands": [
{
"type": "pattern",
"kind": 1,
"value": "a",
"quoted": false,
"negated": false,
"range": {
"start": 5,
"end": 6
}
},
{
"type": "pattern",
"kind": 1,
"value": "b",
"quoted": false,
"negated": false,
"range": {
"start": 10,
"end": 11
}
}
],
"kind": "OR",
"range": {
"start": 5,
"end": 11
},
"groupRange": {
"start": 4,
"end": 12
}
}
],
"kind": "NOT",
"range": {
"start": 0,
"end": 12
}
},
{
"type": "pattern",
"kind": 1,
"value": "c",
"quoted": false,
"negated": false,
"range": {
"start": 17,
"end": 18
}
}
],
"kind": "AND",
"range": {
"start": 0,
"end": 18
}
}
]
`)
expect(parse('not (a and b) or c')).toMatchInlineSnapshot(`
[
{
"type": "operator",
"operands": [
{
"type": "operator",
"operands": [
{
"type": "operator",
"operands": [
{
"type": "pattern",
"kind": 1,
"value": "a",
"quoted": false,
"negated": false,
"range": {
"start": 5,
"end": 6
}
},
{
"type": "pattern",
"kind": 1,
"value": "b",
"quoted": false,
"negated": false,
"range": {
"start": 11,
"end": 12
}
}
],
"kind": "AND",
"range": {
"start": 5,
"end": 12
},
"groupRange": {
"start": 4,
"end": 13
}
}
],
"kind": "NOT",
"range": {
"start": 0,
"end": 13
}
},
{
"type": "pattern",
"kind": 1,
"value": "c",
"quoted": false,
"negated": false,
"range": {
"start": 17,
"end": 18
}
}
],
"kind": "OR",
"range": {
"start": 0,
"end": 18
}
}
]
`)
})
test('query with nested parentheses', () =>
@ -396,56 +594,196 @@ describe('parseSearchQuery', () => {
]
`))
test('query with mixed parenthesis and implicit AND', () =>
expect(parse('(a AND b) c d')).toMatchInlineSnapshot(`
[
{
"type": "sequence",
"nodes": [
{
"type": "operator",
"operands": [
{
"type": "pattern",
"kind": 1,
"value": "a",
"quoted": false,
"negated": false,
"range": {
"start": 1,
"end": 2
}
},
{
"type": "pattern",
"kind": 1,
"value": "b",
"quoted": false,
"negated": false,
"range": {
"start": 7,
"end": 8
}
}
],
"kind": "AND",
"range": {
"start": 1,
"end": 8
},
"groupRange": {
"start": 0,
"end": 9
}
},
{
"type": "pattern",
"kind": 1,
"value": "c",
"quoted": false,
"negated": false,
"range": {
"start": 10,
"end": 11
}
},
{
"type": "pattern",
"kind": 1,
"value": "d",
"quoted": false,
"negated": false,
"range": {
"start": 12,
"end": 13
}
}
],
"range": {
"start": 0,
"end": 13
}
}
]
`))
test('query with mixed explicit OR and implicit AND operators', () =>
expect(parse('aaa bbb or ccc')).toMatchInlineSnapshot(`
[
{
"type": "operator",
"operands": [
{
"type": "sequence",
"nodes": [
{
"type": "pattern",
"kind": 1,
"value": "aaa",
"quoted": false,
"negated": false,
"range": {
"start": 0,
"end": 3
}
},
{
"type": "pattern",
"kind": 1,
"value": "bbb",
"quoted": false,
"negated": false,
"range": {
"start": 4,
"end": 7
}
}
],
"range": {
"start": 0,
"end": 7
}
},
{
"type": "pattern",
"kind": 1,
"value": "ccc",
"quoted": false,
"negated": false,
"range": {
"start": 11,
"end": 14
}
}
],
"kind": "OR",
"range": {
"start": 0,
"end": 14
}
}
]
`))
test('query with mixed explicit and implicit operators inside parens', () =>
expect(parse('(aaa bbb and ccc)')).toMatchInlineSnapshot(`
[
{
"type": "operator",
"operands": [
{
"type": "pattern",
"kind": 1,
"value": "aaa",
"quoted": false,
"negated": false,
"range": {
"start": 1,
"end": 4
}
},
{
"type": "pattern",
"kind": 1,
"value": "bbb",
"quoted": false,
"negated": false,
"range": {
"start": 5,
"end": 8
}
},
{
"type": "pattern",
"kind": 1,
"value": "ccc",
"quoted": false,
"negated": false,
"range": {
"start": 13,
"end": 16
}
}
],
"kind": "AND",
"range": {
"start": 1,
"end": 16
},
"groupRange": {
"start": 0,
"end": 17
}
}
]
`))
[
{
"type": "operator",
"operands": [
{
"type": "sequence",
"nodes": [
{
"type": "pattern",
"kind": 1,
"value": "aaa",
"quoted": false,
"negated": false,
"range": {
"start": 1,
"end": 4
}
},
{
"type": "pattern",
"kind": 1,
"value": "bbb",
"quoted": false,
"negated": false,
"range": {
"start": 5,
"end": 8
}
}
],
"range": {
"start": 1,
"end": 8
}
},
{
"type": "pattern",
"kind": 1,
"value": "ccc",
"quoted": false,
"negated": false,
"range": {
"start": 13,
"end": 16
}
}
],
"kind": "AND",
"range": {
"start": 1,
"end": 16
},
"groupRange": {
"start": 0,
"end": 17
}
}
]
`))
})

View File

@ -18,9 +18,21 @@ export interface Parameter {
range: CharacterRange
}
/**
* A Sequence represent a sequence of nodes, i.e. 'a b c'. While such as
* sequence is often thought about as "implicit AND", it's usually _not_
* equivalent to 'a AND b AND c', which is why this gets its own node type.
*/
export interface Sequence {
type: 'sequence'
nodes: Node[]
range: CharacterRange
}
export enum OperatorKind {
Or = 'OR',
And = 'AND',
Not = 'NOT',
}
/**
@ -31,11 +43,13 @@ export interface Operator {
operands: Node[]
kind: OperatorKind
range: CharacterRange
// Position in the query string including parenthesis if used
/**
* Position in the query string including parenthesis if used
*/
groupRange?: CharacterRange
}
export type Node = Operator | Parameter | Pattern
export type Node = Sequence | Operator | Parameter | Pattern
interface ParseError {
type: 'error'
@ -57,21 +71,8 @@ interface State {
tokens: Token[]
}
const createNodes = (nodes: Node[]): ParseSuccess => ({ type: 'success', nodes })
const createPattern = (
value: string,
kind: PatternKind,
quoted: boolean,
negated: boolean,
range: CharacterRange
): ParseSuccess => createNodes([{ type: 'pattern', kind, value, quoted, negated, range }])
const createParameter = (field: string, value: string, negated: boolean, range: CharacterRange): ParseSuccess =>
createNodes([{ type: 'parameter', field, value, negated, range }])
const createOperator = (nodes: Node[], kind: OperatorKind): ParseSuccess => {
const range: CharacterRange = nodes.reduce(
const rangeFromNodes = (nodes: Node[]): CharacterRange =>
nodes.reduce(
(range, node) => {
const nodeRange = node.type === 'operator' ? node.groupRange ?? node.range : node.range
if (nodeRange.start < range.start) {
@ -85,6 +86,34 @@ const createOperator = (nodes: Node[], kind: OperatorKind): ParseSuccess => {
{ start: Infinity, end: -Infinity }
)
const createNodes = (nodes: Node[]): ParseSuccess => ({ type: 'success', nodes })
const createPattern = (
value: string,
kind: PatternKind,
quoted: boolean,
negated: boolean,
range: CharacterRange
): ParseSuccess => createNodes([{ type: 'pattern', kind, value, quoted, negated, range }])
const createSequence = (nodes: Node[]): ParseSuccess => {
switch (nodes.length) {
case 0:
case 1:
return createNodes(nodes)
default:
return createNodes([{ type: 'sequence', nodes, range: rangeFromNodes(nodes) }])
}
}
const createParameter = (field: string, value: string, negated: boolean, range: CharacterRange): ParseSuccess =>
createNodes([{ type: 'parameter', field, value, negated, range }])
const createOperator = (nodes: Node[], kind: OperatorKind, rangeStart?: number): ParseSuccess => {
const range = rangeFromNodes(nodes)
if (rangeStart !== undefined) {
range.start = rangeStart
}
return createNodes([{ type: 'operator', operands: nodes, kind, range }])
}
@ -123,7 +152,52 @@ export const parseParenthesis = (tokens: Token[]): State => {
return { result: groupNodes.result, tokens }
}
export const parseLeaves = (tokens: Token[]): State => {
const parseNot = (tokens: Token[]): State => {
const keyword = tokens[0]
if (!(keyword.type === 'keyword' && keyword.kind === KeywordKind.Not)) {
throw new Error('parseNot is called at an invalid token position')
}
tokens = tokens.slice(1) // consume NOT
let nodes: Node[] = []
const operand = tokens[0]
if (!operand) {
return { result: createOperator(nodes, OperatorKind.Not), tokens }
}
switch (operand.type) {
case 'openingParen': {
const state = parseParenthesis(tokens)
if (state.result.type === 'error') {
return { result: state.result, tokens }
}
nodes = state.result.nodes
tokens = state.tokens
break
}
default: {
const node = tokenToLeafNode(operand)
if (node.type === 'error') {
return { result: node, tokens }
}
nodes = node.nodes
tokens = tokens.slice(1)
}
}
return { result: createOperator(nodes, OperatorKind.Not, keyword.range.start), tokens }
}
/**
* parseSequence parses consecutive tokens. If the sequence has a size smaller
* than 2 the nodes are returned directly. That means there will never be a
* sequence of size 1.
* This also takes care of parsing the NOT keyword
*/
const parseSequence = (tokens: Token[]): State => {
const nodes: Node[] = []
while (true) {
const current = tokens[0]
@ -131,13 +205,30 @@ export const parseLeaves = (tokens: Token[]): State => {
break
}
if (current.type === 'openingParen') {
return parseParenthesis(tokens)
const state = parseParenthesis(tokens)
if (state.result.type === 'error') {
return { result: state.result, tokens: state.tokens }
}
nodes.push(...state.result.nodes)
tokens = state.tokens
continue
}
if (current.type === 'closingParen') {
break
}
if (current.type === 'keyword' && (current.kind === KeywordKind.And || current.kind === KeywordKind.Or)) {
return { result: createNodes(nodes), tokens } // Caller advances.
if (current.type === 'keyword') {
if (current.kind === KeywordKind.And || current.kind === KeywordKind.Or) {
break // Caller advances.
}
if (current.kind === KeywordKind.Not) {
const state = parseNot(tokens)
if (state.result.type === 'error') {
return { result: state.result, tokens: state.tokens }
}
nodes.push(...state.result.nodes)
tokens = state.tokens
continue
}
}
const node = tokenToLeafNode(current)
@ -147,7 +238,7 @@ export const parseLeaves = (tokens: Token[]): State => {
nodes.push(...node.nodes)
tokens = tokens.slice(1)
}
return { result: createNodes(nodes), tokens }
return { result: createSequence(nodes), tokens }
}
/**
@ -155,7 +246,7 @@ export const parseLeaves = (tokens: Token[]): State => {
* (a and b or c) => ((a and b) or c).
*/
export const parseAnd = (tokens: Token[]): State => {
const left = parseLeaves(tokens)
const left = parseSequence(tokens)
if (left.result.type === 'error') {
return { result: left.result, tokens }
}

View File

@ -13,20 +13,23 @@ expect.addSnapshotSerializer({
* @param input The input query string.
*/
const collect = (input: string): Node[] => {
const nodes: Node[] = []
const visitedNodes: Node[] = []
const visitors: Visitors = {
visitOperator(operands: Node[], kind: OperatorKind, range, groupRange) {
nodes.push({ type: 'operator', operands, kind, range, groupRange })
visitedNodes.push({ type: 'operator', operands, kind, range, groupRange })
},
visitSequence(nodes: Node[], range) {
visitedNodes.push({ type: 'sequence', nodes, range })
},
visitParameter(field, value, negated, range) {
nodes.push({ type: 'parameter', field, value, negated, range })
visitedNodes.push({ type: 'parameter', field, value, negated, range })
},
visitPattern(value, kind, negated, quoted, range) {
nodes.push({ type: 'pattern', value, kind, negated, quoted, range })
visitedNodes.push({ type: 'pattern', value, kind, negated, quoted, range })
},
}
visit((parseSearchQuery(input) as ParseSuccess).nodes, visitors)
return nodes
return visitedNodes
}
describe('visit()', () => {
@ -36,6 +39,56 @@ describe('visit()', () => {
{
"type": "operator",
"operands": [
{
"type": "sequence",
"nodes": [
{
"type": "parameter",
"field": "repo",
"value": "foo",
"negated": false,
"range": {
"start": 0,
"end": 8
}
},
{
"type": "pattern",
"kind": 1,
"value": "pattern-bar",
"quoted": false,
"negated": false,
"range": {
"start": 9,
"end": 20
}
}
],
"range": {
"start": 0,
"end": 20
}
},
{
"type": "parameter",
"field": "file",
"value": "baz",
"negated": false,
"range": {
"start": 24,
"end": 32
}
}
],
"kind": "OR",
"range": {
"start": 0,
"end": 32
}
},
{
"type": "sequence",
"nodes": [
{
"type": "parameter",
"field": "repo",
@ -56,22 +109,11 @@ describe('visit()', () => {
"start": 9,
"end": 20
}
},
{
"type": "parameter",
"field": "file",
"value": "baz",
"negated": false,
"range": {
"start": 24,
"end": 32
}
}
],
"kind": "OR",
"range": {
"start": 0,
"end": 32
"end": 20
}
},
{

View File

@ -1,4 +1,4 @@
import { Node, Operator, Parameter, Pattern, OperatorKind } from './parser'
import { Node, Operator, Parameter, Pattern, OperatorKind, Sequence } from './parser'
import { CharacterRange, PatternKind } from './token'
export class Visitor {
@ -20,6 +20,9 @@ export class Visitor {
case 'operator':
this.visitOperator(node)
break
case 'sequence':
this.visitSequence(node)
break
case 'parameter':
this.visitParameter(node)
break
@ -37,6 +40,13 @@ export class Visitor {
this.visit(node.operands)
}
private visitSequence(node: Sequence): void {
if (this._visitors.visitSequence) {
this._visitors.visitSequence(node.nodes, node.range)
}
this.visit(node.nodes)
}
private visitParameter(node: Parameter): void {
if (this._visitors.visitParameter) {
this._visitors.visitParameter(node.field, node.value, node.negated, node.range)
@ -52,6 +62,7 @@ export class Visitor {
export interface Visitors {
visitOperator?(operands: Node[], kind: OperatorKind, range: CharacterRange, groupRange?: CharacterRange): void
visitSequence?(nodes: Node[], range: CharacterRange): void
visitParameter?(field: string, value: string, negated: boolean, range: CharacterRange): void
visitPattern?(value: string, kind: PatternKind, negated: boolean, quoted: boolean, range: CharacterRange): void
}