What would be the best way in Python to parse out chunks of text contained in matching brackets?
"{ { a } { b } { { { c } } } }"
should i
You could also parse them all at once, though I find it slightly weird for {a}
to mean "a"
rather than ["a"].
If I've understood the format correctly:
import re
import sys
# Compiled up front because only a compiled pattern's match() accepts a start
# position; the module-level re.match() function doesn't have a pos parameter.
# Matches a (possibly empty) run of non-bracket characters followed by "}".
_mbrack_rb = re.compile("([^{}]*)}")


def mbrack(s):
    """Parse matching brackets.

    A bracket pair that directly encloses text yields that text as a string;
    a bracket pair that encloses other bracket pairs yields a list of their
    parsed values.  The input must consist of exactly one root bracket pair.

    Raises ValueError for any malformed input, including the empty string.

    >>> mbrack("{a}")
    'a'
    >>> mbrack("{{a}{b}}")
    ['a', 'b']
    >>> mbrack("{{a}{b}{{{c}}}}")
    ['a', 'b', [['c']]]
    >>> mbrack("")
    Traceback (most recent call last):
    ValueError: expected left bracket
    >>> mbrack("a")
    Traceback (most recent call last):
    ValueError: expected left bracket
    >>> mbrack("{a}{b}")
    Traceback (most recent call last):
    ValueError: more than one root
    >>> mbrack("{a")
    Traceback (most recent call last):
    ValueError: expected value then right bracket
    >>> mbrack("{a{}}")
    Traceback (most recent call last):
    ValueError: expected value then right bracket
    >>> mbrack("{a}}")
    Traceback (most recent call last):
    ValueError: unbalanced brackets (found right bracket)
    >>> mbrack("{{a}")
    Traceback (most recent call last):
    ValueError: unbalanced brackets (not enough right brackets)
    """
    if not s:
        # Without this guard the loop below never runs and the final
        # stack[0][0] lookup would fail with an IndexError instead.
        raise ValueError("expected left bracket")

    # stack[0] is a synthetic root list that ends up holding the single
    # parsed value; stack[-1] is always the list currently being filled.
    stack = [[]]
    i, end = 0, len(s)
    while i < end:
        if s[i] != "{":
            raise ValueError("expected left bracket")
        elif i != 0 and len(stack) == 1:
            # We are back at the root but another "{" follows.
            raise ValueError("more than one root")

        # Open one nested list per consecutive "{".
        while i < end and s[i] == "{":
            child = []
            stack[-1].append(child)
            stack.append(child)
            i += 1

        # The innermost "{" encloses a plain value, not a list, so discard
        # the list that was just created for it (from the stack and from
        # its parent) and store the matched text in the parent instead.
        stack.pop()
        stack[-1].pop()
        m = _mbrack_rb.match(s, i)
        if m is None:
            raise ValueError("expected value then right bracket")
        stack[-1].append(m.group(1))
        i = m.end(0)

        # Each further "}" closes one enclosing list.
        while i < end and s[i] == "}":
            if len(stack) == 1:
                raise ValueError("unbalanced brackets (found right bracket)")
            stack.pop()
            i += 1

    if len(stack) != 1:
        raise ValueError("unbalanced brackets (not enough right brackets)")
    return stack[0][0]
def main(args):
    """Run the doctests of the main module and return the failure count.

    args is the list of command-line arguments.  None are expected; if any
    are given, a warning is written to stderr and the tests still run.

    The doctest result is printed to stdout, and its first element (the
    number of failed examples) is returned so it can be used as the exit
    status.
    """
    if args:
        # sys.stderr.write and print(...) with a single argument behave the
        # same on Python 2 and Python 3, unlike the print statement.
        sys.stderr.write("unexpected arguments: %r\n" % args)
    import doctest
    r = doctest.testmod()
    print(r)
    return r[0]


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
If you want to use a parser (lepl in this case), but still want the intermediate results rather than a final parsed list, then I think this is the kind of thing you were looking for:
>>> nested = Delayed()
>>> nested += "{" + (nested[1:,...]|Any()) + "}"
>>> split = (Drop("{") & (nested[:,...]|Any()) & Drop("}"))[:].parse
>>> split("{{a}{b}{{{c}}}}")
['{a}{b}{{{c}}}']
>>> split("{a}{b}{{{c}}}")
['a', 'b', '{{c}}']
>>> split("{{c}}")
['{c}']
>>> split("{c}")
['c']
That might look opaque at first, but it's fairly simple really :o)
nested is a recursive definition of a matcher for nested brackets (the "+" and [...] in the definition keep everything as a single string after it has been matched). Then split says match as many as possible ("[:]") of something that is surrounded by "{" ... "}" (which we discard with "Drop") and contains either a nested expression or any letter.
Finally, here's a lepl version of the "all in one" parser that gives a result in the same format as the pyparsing example above, but which (I believe) is more flexible about how spaces appear in the input:
>>> with Separator(~Space()[:]):
... nested = Delayed()
... nested += Drop("{") & (nested[1:] | Any()) & Drop("}") > list
...
>>> nested.parse("{{ a }{ b}{{{c}}}}")
[[['a'], ['b'], [[['c']]]]]
Here is a solution I came up with for a similar use case. This was loosely based on the accepted pseudocode answer. I didn't want to add any dependencies on external libraries:
def parse_segments(source, recurse=False):
    """Split a string into the text outside and inside its parentheses.

    source should be a string.  It is scanned for top-level parenthesised
    groups.  The returned list holds, first, every non-empty stretch of text
    found outside those groups (in order of appearance), followed by the
    contents of each group with its bounding parentheses removed.

    If recurse is true, the contents of each group are themselves split by
    this function and the resulting pieces are added to the list instead of
    the unsplit contents.

    Raises AssertionError if the parentheses in source are not balanced.
    """
    depth = 0        # number of currently unclosed "("
    open_pos = 0     # index of the "(" that opened the current top-level group
    start_pos = 0    # index just past the most recently closed group

    finished = []
    segments = []

    for cur_pos, character in enumerate(source):
        if character == '(':
            if depth == 0:
                open_pos = cur_pos
            depth += 1
        elif character == ')':
            if depth == 0:
                # A close with nothing open would otherwise drive the
                # counter negative and produce meaningless segments.
                raise AssertionError(
                    "unbalanced parentheses: unexpected ')' at position %d"
                    % cur_pos)
            depth -= 1
            if depth == 0:
                # A top-level group just closed: keep the text that came
                # before it and remember the group's contents.
                clean = source[start_pos:open_pos]
                if clean:
                    finished.append(clean)
                segments.append(source[open_pos + 1:cur_pos])
                start_pos = cur_pos + 1

    # Raised explicitly (rather than via an assert statement) so the check
    # is not stripped when Python runs with -O; the exception type is kept
    # for callers that already catch AssertionError.
    if depth != 0:
        raise AssertionError("unbalanced parentheses: missing ')'")

    # Get anything that was left over after the last group.
    leftover = source[start_pos:]
    if leftover:
        finished.append(leftover)

    # Now add the group contents, splitting them further if requested.
    for pruned in segments:
        if recurse:
            finished.extend(parse_segments(pruned, recurse))
        else:
            finished.append(pruned)

    return finished