I want to be able to match a pattern in glob format to a list of strings, rather than to actual files in the filesystem. Is there any way to do this, or to convert a glob pattern into a regular expression?
Can't say how efficient it is, but it is much less verbose, much less complicated, more complete, and possibly more secure/reliable than other solutions.
Supported syntax:
- `*` -- matches zero or more characters.
- `**` (actually, it's either `**/` or `/**`) -- matches zero or more subdirectories.
- `?` -- matches one character.
- `[]` -- matches one character within brackets.
- `[!]` -- matches one character not within brackets.
- Because `\` is used for escaping, only `/` can be used as a path separator.

Order of operation:
import re
from sys import hexversion, implementation
# Insertion-ordered dicts are a language guarantee since Python 3.7 and already
# behave that way in CPython 3.6; anything older needs OrderedDict.
if hexversion >= 0x03070000 or (implementation.name == 'cpython' and hexversion >= 0x03060000):
    ordered_dict = dict
else:
    from collections import OrderedDict as ordered_dict

# Maps each glob token, spelled the way ``re.escape`` renders it (Python 3.7+
# leaves ``/`` and ``!`` unescaped), to the RE fragment that replaces it.
# Iteration order is the alternation order of the tokenizer below, so a longer
# token must come before any token that is its prefix: ``\*\*/`` before ``\*``,
# and ``\[\*\]``, ``\[\?\]``, ``\[!`` before ``\[``.
# Raw strings keep the backslashes literal without invalid-escape warnings.
escaped_glob_tokens_to_re = ordered_dict((
    # ``**`` is recursive only next to a ``/``; a bare ``**`` is tokenized as
    # two ordinary ``*``.  The relative order of the two recursive tokens does
    # not matter because they start with different characters.
    (r'/\*\*', r'(?:/.+?)*'),   # Recursive glob in the middle or at the end of a path.
    (r'\*\*/', r'(?:^.+?/)*'),  # Recursive glob at the start; ``^`` pins it to that location.
    (r'\*', r'[^/]*?'),         # ``[^/]`` keeps ``*`` inside a single path component.
    (r'\?', r'[^/]'),           # Exactly one character, and like ``*`` never a ``/``.
    (r'\[\*\]', r'\*'),         # ``[*]`` is a literal asterisk.
    (r'\[\?\]', r'\?'),         # ``[?]`` is a literal question mark.
    (r'\[!', r'[^'),            # Negated character class.
    (r'\[', r'['),
    (r'\]', r']'),
    # ``re.escape`` escapes ``-``, which would turn the range ``[a-c]`` into the
    # three literals ``a``, ``-``, ``c``.  Outside a class a bare ``-`` is literal.
    (r'\-', r'-'),
))

# Tokenizer over the *escaped* glob text: every key is matched literally.
escaped_glob_replacement = re.compile(
    '(%s)' % '|'.join(re.escape(token) for token in escaped_glob_tokens_to_re)
)


def glob_to_re(pattern):
    """Translate a glob pattern into regular-expression source text.

    The pattern is first passed through ``re.escape`` so that every character
    is literal, then each escaped glob token is swapped for its RE equivalent
    from ``escaped_glob_tokens_to_re``.  Only ``/`` is treated as a path
    separator.

    :param pattern: glob pattern (``*``, ``**/``, ``/**``, ``?``, ``[...]``,
        ``[!...]``, ``[*]``, ``[?]``).
    :return: RE source string, meant to be used with ``re.fullmatch``.

    Brackets are copied through as-is, so an unbalanced ``[`` produces text
    that ``re`` will refuse to compile.
    """
    return escaped_glob_replacement.sub(
        lambda match: escaped_glob_tokens_to_re[match.group(0)],
        re.escape(pattern),
    )
if __name__ == '__main__':
    # Self-check table; each row is (should the glob match?, path, glob pattern).
    validity_paths_globs = (
        (True, 'foo.py', 'foo.py'),
        (True, 'foo.py', 'fo[o].py'),
        (True, 'fob.py', 'fo[!o].py'),
        (True, '*foo.py', '[*]foo.py'),
        (True, 'foo.py', '**/foo.py'),
        (True, 'baz/duck/bar/bam/quack/foo.py', '**/bar/**/foo.py'),
        (True, 'bar/foo.py', '**/foo.py'),
        (True, 'bar/baz/foo.py', 'bar/**'),
        (False, 'bar/baz/foo.py', 'bar/*'),
        (False, 'bar/baz/foo.py', 'bar**/foo.py'),
        (True, 'bar/baz/foo.py', 'bar/**/foo.py'),
        (True, 'bar/baz/wut/foo.py', 'bar/**/foo.py'),
    )
    results = []
    separator = '-'*79
    for expected, candidate, glob_text in validity_paths_globs:
        # Echo the inputs before translating, so they are visible even if
        # the translation itself blows up.
        for label, value in (
            ('valid:', expected),
            ('path:', candidate),
            ('glob pattern:', glob_text),
        ):
            print(label, value)
        regex_text = glob_to_re(glob_text)
        print('RE pattern:', regex_text)
        found = re.fullmatch(regex_text, candidate)
        print('match:', found)
        # A row passes when "did it match" agrees with the expectation.
        as_expected = (found is not None) == expected
        results.append(as_expected)
        print('result was expected:', as_expected)
        print(separator)
    print('all results were expected:', all(results))
    print('='*79)