feat(glob): new Glob implementation based on Peglob #33605

|vim.glob.to_lpeg()| uses a new LPeg-based implementation (Peglob) that
provides ~50% speedup for complex patterns. The implementation restores
support for nested braces and follows LSP 3.17 specification with
additional constraints for improved correctness and resistance to
backtracking edge cases.
This commit is contained in:
Brynne Taylor
2025-05-22 15:24:49 +08:00
committed by GitHub
parent 172a90c245
commit 322a6d305d
4 changed files with 464 additions and 129 deletions

View File

@@ -18,6 +18,7 @@ describe('glob', function()
eq(true, match('', ''))
eq(false, match('', 'a'))
eq(true, match('a', 'a'))
eq(true, match('.', '.'))
eq(true, match('/', '/'))
eq(true, match('abc', 'abc'))
eq(false, match('abc', 'abcdef'))
@@ -35,7 +36,8 @@ describe('glob', function()
end)
it('should match * wildcards', function()
eq(false, match('*', ''))
eq(true, match('*', ''))
eq(true, match('*', ' '))
eq(true, match('*', 'a'))
eq(false, match('*', '/'))
eq(false, match('*', '/a'))
@@ -43,6 +45,7 @@ describe('glob', function()
eq(true, match('*', 'aaa'))
eq(true, match('*a', 'aa'))
eq(true, match('*a', 'abca'))
eq(true, match('*.ts', '.ts'))
eq(true, match('*.txt', 'file.txt'))
eq(false, match('*.txt', 'file.txtxt'))
eq(false, match('*.txt', 'dir/file.txt'))
@@ -62,18 +65,13 @@ describe('glob', function()
eq(false, match('dir/*/file.txt', 'dir/file.txt'))
eq(true, match('dir/*/file.txt', 'dir/subdir/file.txt'))
eq(false, match('dir/*/file.txt', 'dir/subdir/subdir/file.txt'))
-- The spec does not describe this, but VSCode only interprets ** when it's by
-- itself in a path segment, and otherwise interprets ** as consecutive * directives.
-- see: https://github.com/microsoft/vscode/blob/eef30e7165e19b33daa1e15e92fa34ff4a5df0d3/src/vs/base/common/glob.ts#L112
eq(true, match('a**', 'abc')) -- '**' should parse as two '*'s when not by itself in a path segment
eq(true, match('**c', 'abc'))
eq(false, match('a**', 'ab')) -- each '*' should still represent at least one character
eq(false, match('**c', 'bc'))
eq(true, match('a**', 'abcd'))
eq(true, match('**d', 'abcd'))
eq(false, match('a**', 'abc/d'))
eq(false, match('**d', 'abc/d'))
eq(true, match('a*b*c*d*e*', 'axbxcxdxe'))
eq(true, match('a*b*c*d*e*', 'axbxcxdxexxx'))
eq(true, match('a*b?c*x', 'abxbbxdbxebxczzx'))
eq(false, match('a*b?c*x', 'abxbbxdbxebxczzy'))
eq(true, match('a*b*[cy]*d*e*', 'axbxcxdxexxx'))
eq(true, match('a*b*[cy]*d*e*', 'axbxyxdxexxx'))
eq(true, match('a*b*[cy]*d*e*', 'axbxxxyxdxexxx'))
end)
it('should match ? wildcards', function()
@@ -84,6 +82,11 @@ describe('glob', function()
eq(true, match('??', 'ab'))
eq(true, match('a?c', 'abc'))
eq(false, match('a?c', 'a/c'))
eq(false, match('a/', 'a/.b'))
eq(true, match('?/?', 'a/b'))
eq(true, match('/??', '/ab'))
eq(true, match('/?b', '/ab'))
eq(false, match('foo?bar', 'foo/bar'))
end)
it('should match ** wildcards', function()
@@ -99,7 +102,7 @@ describe('glob', function()
eq(true, match('/**', '/'))
eq(true, match('/**', '/a/b/c'))
eq(true, match('**/', '')) -- **/ absorbs trailing /
eq(true, match('**/', '/a/b/c'))
eq(false, match('**/', '/a/b/c'))
eq(true, match('**/**', ''))
eq(true, match('**/**', 'a'))
eq(false, match('a/**', ''))
@@ -134,20 +137,9 @@ describe('glob', function()
end)
it('should match {} groups', function()
eq(true, match('{}', ''))
eq(false, match('{}', 'a'))
eq(true, match('a{}', 'a'))
eq(true, match('{}a', 'a'))
eq(true, match('{,}', ''))
eq(true, match('{a,}', ''))
eq(true, match('{a,}', 'a'))
eq(true, match('{a}', 'a'))
eq(false, match('{a}', 'aa'))
eq(false, match('{a}', 'ab'))
eq(true, match('{a?c}', 'abc'))
eq(false, match('{ab}', 'a'))
eq(false, match('{ab}', 'b'))
eq(true, match('{ab}', 'ab'))
eq(true, match('{a,b}', 'a'))
eq(true, match('{a,b}', 'b'))
eq(false, match('{a,b}', 'ab'))
@@ -155,7 +147,22 @@ describe('glob', function()
eq(false, match('{ab,cd}', 'a'))
eq(true, match('{ab,cd}', 'cd'))
eq(true, match('{a,b,c}', 'c'))
eq(false, match('{a,{b,c}}', 'c')) -- {} cannot nest
eq(true, match('{a,{b,c}}', 'c'))
eq(true, match('a{,/}*.txt', 'a.txt'))
eq(true, match('a{,/}*.txt', 'ab.txt'))
eq(true, match('a{,/}*.txt', 'a/b.txt'))
eq(true, match('a{,/}*.txt', 'a/ab.txt'))
eq(true, match('a/{a{a,b},b}', 'a/aa'))
eq(true, match('a/{a{a,b},b}', 'a/ab'))
eq(false, match('a/{a{a,b},b}', 'a/ac'))
eq(true, match('a/{a{a,b},b}', 'a/b'))
eq(false, match('a/{a{a,b},b}', 'a/c'))
eq(true, match('foo{bar,b*z}', 'foobar'))
eq(true, match('foo{bar,b*z}', 'foobuzz'))
eq(true, match('foo{bar,b*z}', 'foobarz'))
eq(true, match('{a,b}/c/{d,e}/**/*est.ts', 'a/c/d/one/two/three.test.ts'))
eq(true, match('{a,{d,e}b}/c', 'a/c'))
eq(true, match('{**/a,**/b}', 'b'))
end)
it('should match [] groups', function()
@@ -181,6 +188,13 @@ describe('glob', function()
eq(true, match('[a-zA-Z0-9]', 'Z'))
eq(true, match('[a-zA-Z0-9]', '9'))
eq(false, match('[a-zA-Z0-9]', '&'))
eq(true, match('[?]', '?'))
eq(false, match('[?]', 'a'))
eq(true, match('[*]', '*'))
eq(false, match('[*]', 'a'))
eq(true, match('[\\!]', '!'))
eq(true, match('a\\*b', 'a*b'))
eq(false, match('a\\*b', 'axb'))
end)
it('should match [!...] groups', function()
@@ -202,8 +216,7 @@ describe('glob', function()
it('should handle long patterns', function()
-- lpeg has a recursion limit of 200 by default, make sure the grammar does not trigger it on
-- strings longer than that
local fill_200 =
'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
local fill_200 = ('a'):rep(200)
eq(200, fill_200:len())
local long_lit = fill_200 .. 'a'
eq(false, match(long_lit, 'b'))
@@ -212,6 +225,21 @@ describe('glob', function()
eq(true, match(long_pat, fill_200 .. 'a/b/c/d.c'))
end)
-- Patterns and subjects containing multi-byte UTF-8 characters: literal path
-- segments, brace alternatives, `*` wildcards and `[...]` character classes.
it('should match unicode patterns', function()
  -- Multi-byte literals combined with a {} alternative list.
  eq(true, match('😎/¢£.{ts,tsx,js,jsx}', '😎/¢£.ts'))
  eq(true, match('😎/¢£.{ts,tsx,js,jsx}', '😎/¢£.tsx'))
  eq(true, match('😎/¢£.{ts,tsx,js,jsx}', '😎/¢£.js'))
  eq(true, match('😎/¢£.{ts,tsx,js,jsx}', '😎/¢£.jsx'))
  eq(false, match('😎/¢£.{ts,tsx,js,jsx}', '😎/¢£.jsxxxxxxxx'))
  -- `*` next to, and spanning, a multi-byte character.
  eq(true, match('*é*', 'café noir'))
  eq(true, match('caf*noir', 'café noir'))
  eq(true, match('caf*noir', 'cafeenoir'))
  -- One case per member of the class. The subject must be 'F' followed by that
  -- member; an empty subject can never match a pattern with a literal 'F'.
  eq(true, match('F[ë£a]', 'Fë'))
  eq(true, match('F[ë£a]', 'F£'))
  eq(true, match('F[ë£a]', 'Fa'))
end)
it('should match complex patterns', function()
eq(false, match('**/*.{c,h}', ''))
eq(false, match('**/*.{c,h}', 'c'))