Given an NLP parse tree like
(ROOT (S (NP (PRP You)) (VP (MD could) (VP (VB say) (SBAR (IN that) (S (NP (PRP they)) (ADVP (RB regularly)) (VP (VB catch) (NP
You can use `Tree.subtrees()` to iterate over every subtree of the parse. For more information, see the documentation of the NLTK `Tree` class.
from nltk import Tree
# Split a constituency parse into clause fragments.
#
# Every S / SBAR constituent is rendered as text, then each one is trimmed to
# the words that precede the next clause found in the tree, so nested clauses
# are printed separately instead of being repeated inside their parents.
parse_str = "(ROOT (S (NP (PRP You)) (VP (MD could) (VP (VB say) (SBAR (IN that) (S (NP (PRP they)) (ADVP (RB regularly)) (VP (VB catch) (NP (NP (DT a) (NN shower)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBZ adds) (PP (TO to) (NP (NP (PRP$ their) (NN exhilaration)) (CC and) (NP (FW joie) (FW de) (FW vivre))))))))))))) (. .)))"
# Alternative input whose subordinate clause comes first:
#parse_str = "(ROOT (S (SBAR (IN Though) (S (NP (PRP he)) (VP (VBD was) (ADJP (RB very) (JJ rich))))) (, ,) (NP (PRP he)) (VP (VBD was) (ADVP (RB still)) (ADJP (RB very) (JJ unhappy))) (. .)))"
t = Tree.fromstring(parse_str)

# Text of every clause-level constituent, in the order Tree.subtrees() yields
# them (a parent clause is yielded before the clauses nested inside it).
subtexts = [
    ' '.join(subtree.leaves())
    for subtree in t.subtrees()
    if subtree.label() in ("S", "SBAR")
]

# Untouched copy: all substring searches run against the original clause
# texts, so an already-trimmed (possibly empty) neighbour cannot produce a
# bogus match position.
presubtexts = subtexts[:]

# Keep only the part of each clause that precedes the next clause in the list.
for i in range(len(subtexts) - 1):
    cut = presubtexts[i].find(presubtexts[i + 1])
    # find() returns -1 when the next clause is a sibling rather than nested
    # in this one; such a clause is kept whole instead of raising ValueError.
    if cut != -1:
        subtexts[i] = presubtexts[i][:cut]

for text in subtexts:
    print(text)

# Whatever follows the first nested clause inside the outermost clause
# (typically trailing punctuation).  Heuristic only - not verified for
# sentences with several top-level clauses.
leftover = ''
if len(presubtexts) > 1:
    start = presubtexts[0].find(presubtexts[1])
    if start != -1:
        leftover = presubtexts[0][start + len(presubtexts[1]):]
print(leftover)
You could say
they regularly catch a shower ,
adds to their exhilaration and joie de vivre
First, get the constituency parse tree of the sentence:
# Obtain a bracketed constituency parse for one sentence from a CoreNLP server.
# stanza.install_corenlp()  # uncomment to install CoreNLP first
from stanza.server import CoreNLPClient

text = "Joe realized that the train was late while he waited at the train station"

# output_format='json' makes annotate() return a plain dict; the default
# serialized format yields a protobuf Document, which cannot be indexed with
# output['sentences'].
with CoreNLPClient(
        annotators=['tokenize', 'pos', 'lemma', 'parse', 'depparse'],
        output_format='json',
        memory='16G') as client:
    output = client.annotate(text)

# Parse of the first sentence, as a bracketed string.
parse_tree = output['sentences'][0]['parse']
# Collapse all whitespace runs (including line breaks) into single spaces so
# the tree is a single line.
parse_tree = ' '.join(parse_tree.split())
Then use this gist to extract clauses by calling:
The output will be:
{'the train was late', 'he waited at the train station', 'Joe realized'}