How to extract subjects in a sentence and their respective dependent phrases?

后端 未结 2 1644
逝去的感伤
逝去的感伤 2021-01-30 06:06

I am trying to work on subject extraction in a sentence, so that I can get the sentiments in accordance with the subject. I am using NLTK in Python 2.7 for this purpose…

2条回答
  •  挽巷
    挽巷 (楼主)
    2021-01-30 06:34

    I explored the spaCy library further and finally figured out a solution using dependency parsing. Thanks to this repo, I figured out how to include adjectives in my subject-verb-object triples (making them SVAOs), as well as how to extract compound subjects from the query. Here is my solution:

    from nltk.stem.wordnet import WordNetLemmatizer
    from spacy.lang.en import English
    
    # Dependency labels (values compared against ``Token.dep_``), grouped by the
    # role they play in the extraction helpers that reference them.
    SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
    OBJECTS = ["dobj", "dative", "attr", "oprd"]
    # Fixed: "possessive" used to be spelled " possessive" (leading space), so it
    # could never equal a dependency label.
    ADJECTIVES = ["acomp", "advcl", "advmod", "amod", "appos", "nn", "nmod", "ccomp", "complm",
                  "hmod", "infmod", "xcomp", "rcmod", "poss", "possessive"]
    COMPOUNDS = ["compound"]
    PREPOSITIONS = ["prep"]
    
    def getSubsFromConjunctions(subs):
        """Collect extra subjects joined to ``subs`` by the word "and".

        For every token in ``subs`` that has a right child whose ``lower_`` is
        "and", each right child that carries a label from ``SUBJECTS`` or is
        tagged ``NOUN`` is gathered. After each such token, the function recurses
        on everything gathered so far and appends whatever that call returns.

        Returns:
            A list with the additional tokens only; the members of ``subs``
            themselves are not included.
        """
        gathered = []
        for subject in subs:
            # Materialize once: the right children are scanned twice below.
            children = list(subject.rights)
            if not any(child.lower_ == "and" for child in children):
                continue
            for child in children:
                if child.dep_ in SUBJECTS or child.pos_ == "NOUN":
                    gathered.append(child)
            if gathered:
                gathered += getSubsFromConjunctions(gathered)
        return gathered
    
    def getObjsFromConjunctions(objs):
        """Collect extra objects joined to ``objs`` by the word "and".

        A token in ``objs`` contributes only when one of its right children has
        ``lower_ == "and"``; in that case every right child that carries a label
        from ``OBJECTS`` or is tagged ``NOUN`` is added. The accumulated list is
        then fed back into this function and the result appended.

        Returns:
            A list of the additional tokens (never the members of ``objs``).
        """
        extras = []
        for obj in objs:
            right_children = list(obj.rights)
            right_words = [child.lower_ for child in right_children]
            if "and" not in right_words:
                continue
            extras += [
                child for child in right_children
                if child.dep_ in OBJECTS or child.pos_ == "NOUN"
            ]
            if extras:
                extras += getObjsFromConjunctions(extras)
        return extras
    
    def getVerbsFromConjunctions(verbs):
        """Collect verbs joined to ``verbs`` by the word "and".

        For each verb with a right child whose ``lower_`` is "and", all of its
        right children tagged ``VERB`` are gathered; the function then recurses
        on everything gathered so far and appends that result.

        Returns:
            A list of the additional verb tokens only.
        """
        collected = []
        for verb in verbs:
            if not any(child.lower_ == "and" for child in verb.rights):
                continue
            collected += [child for child in verb.rights if child.pos_ == "VERB"]
            if collected:
                collected += getVerbsFromConjunctions(collected)
        return collected
    
    def findSubs(tok):
        """Find subjects for ``tok`` by climbing its chain of heads.

        Starting at ``tok.head``, walk upwards until reaching a token tagged
        ``VERB`` or ``NOUN``, or a token that is its own head.

        * If a verb is reached, its left children labelled as subjects are
          used (plus any found by ``getSubsFromConjunctions``); when it has
          none and is not its own head, the search continues from that verb.
        * If a noun is reached, that noun is returned as the only subject.

        Args:
            tok: Token exposing ``head``, ``pos_``, ``dep_``, ``lefts`` and
                ``rights``.

        Returns:
            A ``(subjects, negated)`` tuple. ``negated`` is ``isNegated`` of the
            verb whose subjects were taken, or of ``tok`` when a noun was
            reached; ``([], False)`` when nothing is found.
        """
        head = tok.head
        # Climb until a verb, a noun, or a token that is its own head.
        while head.pos_ not in ("VERB", "NOUN") and head.head != head:
            head = head.head
        if head.pos_ == "VERB":
            # Fixed: this used to test ``dep_ == "SUB"``, a label that is not in
            # SUBJECTS (the list getAllSubs filters on), so this branch could
            # never find a subject. Use the same label list instead.
            subs = [left for left in head.lefts if left.dep_ in SUBJECTS]
            if subs:
                verbNegated = isNegated(head)
                subs.extend(getSubsFromConjunctions(subs))
                return subs, verbNegated
            if head.head != head:
                # This verb has no subject of its own; keep climbing.
                return findSubs(head)
        elif head.pos_ == "NOUN":
            return [head], isNegated(tok)
        return [], False
    
    def isNegated(tok):
        """Report whether any direct child of ``tok`` is a negation word.

        Both the left and the right children are inspected; a child counts when
        its ``lower_`` is one of "no", "not", "n't", "never" or "none".
        """
        negation_words = ("no", "not", "n't", "never", "none")
        return any(
            child.lower_ in negation_words
            for side in (tok.lefts, tok.rights)
            for child in side
        )
    
    def findSVs(tokens):
        """Return ``(subject, verb)`` string pairs for every verb in ``tokens``.

        Each token tagged ``VERB`` is passed to ``getAllSubs``; one pair is
        produced per subject, using the ``orth_`` text of both tokens. The verb
        text is prefixed with "!" when ``getAllSubs`` reports it as negated.
        """
        pairs = []
        for verb in [candidate for candidate in tokens if candidate.pos_ == "VERB"]:
            subjects, negated = getAllSubs(verb)
            for subject in subjects:
                if negated:
                    verb_text = "!" + verb.orth_
                else:
                    verb_text = verb.orth_
                pairs.append((subject.orth_, verb_text))
        return pairs
    
    def getObjsFromPrepositions(deps):
        """Collect objects hanging off the prepositions found in ``deps``.

        Only tokens tagged ``ADP`` with dependency label "prep" are considered.
        From each of them, a right child is taken when its label is in
        ``OBJECTS`` or when it is the pronoun "me".
        """
        found = []
        for dep in deps:
            if dep.pos_ != "ADP" or dep.dep_ != "prep":
                continue
            for child in dep.rights:
                if child.dep_ in OBJECTS or (child.pos_ == "PRON" and child.lower_ == "me"):
                    found.append(child)
        return found
    
    def getAdjectives(toks):
        """Return each token of ``toks`` together with its modifier children.

        For every token the output contains, in order: its left children whose
        ``dep_`` is in ``ADJECTIVES``, the token itself, then its right children
        whose ``dep_`` is in ``ADJECTIVES``. The groups for all tokens are
        concatenated into one flat list.

        Args:
            toks: Iterable of tokens exposing ``lefts``, ``rights`` and ``dep_``.

        Returns:
            A flat list of tokens.
        """
        toks_with_adjectives = []
        for tok in toks:
            toks_with_adjectives.extend(
                left for left in tok.lefts if left.dep_ in ADJECTIVES
            )
            toks_with_adjectives.append(tok)
            # Fixed: the filter used to test ``tok.dep_`` (the parent) instead of
            # the right child's own label, so right children were kept or dropped
            # all together depending on the parent.
            toks_with_adjectives.extend(
                right for right in tok.rights if right.dep_ in ADJECTIVES
            )
        return toks_with_adjectives
    
    def getObjsFromAttrs(deps):
        """Find a verb under a noun attribute that has objects of its own.

        Scans ``deps`` for tokens tagged ``NOUN`` with label "attr". For each
        verb among such a token's right children, objects are gathered from the
        verb's right children (labels in ``OBJECTS``) and from
        ``getObjsFromPrepositions``. The first verb with at least one object
        wins.

        Returns:
            ``(verb, objects)`` for the first match, otherwise ``(None, None)``.
        """
        for dep in deps:
            if dep.pos_ != "NOUN" or dep.dep_ != "attr":
                continue
            for verb in [child for child in dep.rights if child.pos_ == "VERB"]:
                children = list(verb.rights)
                found = [child for child in children if child.dep_ in OBJECTS]
                found += getObjsFromPrepositions(children)
                if found:
                    return verb, found
        return None, None
    
    def getObjFromXComp(deps):
        """Find the first "xcomp" verb in ``deps`` that has objects.

        For each token tagged ``VERB`` with label "xcomp", objects are gathered
        from its right children (labels in ``OBJECTS``) and from
        ``getObjsFromPrepositions`` applied to those same children.

        Returns:
            ``(verb, objects)`` for the first verb with a non-empty object list,
            otherwise ``(None, None)``.
        """
        xcomp_verbs = (dep for dep in deps if dep.pos_ == "VERB" and dep.dep_ == "xcomp")
        for verb in xcomp_verbs:
            children = list(verb.rights)
            found = [child for child in children if child.dep_ in OBJECTS]
            found.extend(getObjsFromPrepositions(children))
            if found:
                return verb, found
        return None, None
    
    def getAllSubs(v):
        """Return the subjects of verb ``v`` and whether the verb is negated.

        Direct subjects are the left children of ``v`` whose label is in
        ``SUBJECTS`` and that are not tagged ``DET``; conjoined subjects from
        ``getSubsFromConjunctions`` are appended. When ``v`` has no direct
        subject, both the subjects and the negation flag come from
        ``findSubs(v)`` instead.

        Returns:
            A ``(subjects, negated)`` tuple.
        """
        negated = isNegated(v)
        direct = [
            left for left in v.lefts
            if left.dep_ in SUBJECTS and left.pos_ != "DET"
        ]
        if direct:
            direct += getSubsFromConjunctions(direct)
            return direct, negated
        # No subject attached to this verb: fall back to the head-chain search,
        # which also supplies the negation flag.
        inherited, negated = findSubs(v)
        return list(inherited), negated
    
    def getAllObjs(v):
        """Return the effective verb and all objects reachable from ``v``.

        Objects come from the right children of ``v`` with a label in
        ``OBJECTS``, from ``getObjsFromPrepositions``, and from an "xcomp" verb
        found by ``getObjFromXComp``. When such an xcomp verb contributes
        objects, it replaces ``v`` in the result. Conjoined objects from
        ``getObjsFromConjunctions`` are appended last.

        Returns:
            A ``(verb, objects)`` tuple.
        """
        # ``rights`` is a generator, so keep a list for the repeated scans.
        children = list(v.rights)
        found = [child for child in children if child.dep_ in OBJECTS]
        found += getObjsFromPrepositions(children)

        xcomp_verb, xcomp_objs = getObjFromXComp(children)
        if xcomp_verb is not None and xcomp_objs:
            found += xcomp_objs
            v = xcomp_verb

        if found:
            found += getObjsFromConjunctions(found)
        return v, found
    
    def getAllObjsWithAdjectives(v):
        """Return the effective verb and its objects, accepting modifiers too.

        Works like ``getAllObjs`` except that, when no right child of ``v`` has
        a label in ``OBJECTS``, the right children with a label in
        ``ADJECTIVES`` are used as the starting objects instead. Objects from
        ``getObjsFromPrepositions``, from an "xcomp" verb (which then replaces
        ``v``) and from ``getObjsFromConjunctions`` are appended afterwards.

        Returns:
            A ``(verb, objects)`` tuple.
        """
        # ``rights`` is a generator, so keep a list for the repeated scans.
        children = list(v.rights)
        found = [child for child in children if child.dep_ in OBJECTS]
        if not found:
            found = [child for child in children if child.dep_ in ADJECTIVES]
        found += getObjsFromPrepositions(children)

        xcomp_verb, xcomp_objs = getObjFromXComp(children)
        if xcomp_verb is not None and xcomp_objs:
            found += xcomp_objs
            v = xcomp_verb

        if found:
            found += getObjsFromConjunctions(found)
        return v, found
    
    def findSVOs(tokens):
        """Return ``(subject, verb, object)`` string triples found in ``tokens``.

        Every token tagged ``VERB`` whose label is not "aux" is examined. Verbs
        without subjects are skipped. One triple is emitted per subject/object
        combination, using the ``lower_`` text of each token; the verb text is
        prefixed with "!" when either the verb or the object is negated.
        """
        triples = []
        main_verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
        for verb in main_verbs:
            subjects, verb_negated = getAllSubs(verb)
            if not subjects:
                # No subject found: nothing to report for this verb.
                continue
            verb, objects = getAllObjs(verb)
            for subject in subjects:
                for obj in objects:
                    obj_negated = isNegated(obj)
                    if verb_negated or obj_negated:
                        verb_text = "!" + verb.lower_
                    else:
                        verb_text = verb.lower_
                    triples.append((subject.lower_, verb_text, obj.lower_))
        return triples
    
    def findSVAOs(tokens):
        """Return subject/verb/object triples with compounds and modifiers.

        Every token tagged ``VERB`` whose label is not "aux" is examined; verbs
        without subjects are skipped. For each subject/object combination the
        triple holds: the subject expanded by ``generate_sub_compound``, the
        verb's ``lower_`` text (prefixed with "!" when the verb or the object is
        negated), and the object expanded by ``generate_left_right_adjectives``.
        Expanded tokens are lower-cased and joined with single spaces.
        """
        results = []
        main_verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
        for verb in main_verbs:
            subjects, verb_negated = getAllSubs(verb)
            if not subjects:
                # No subject found: nothing to report for this verb.
                continue
            verb, objects = getAllObjsWithAdjectives(verb)
            for subject in subjects:
                for obj in objects:
                    obj_negated = isNegated(obj)
                    described_obj = generate_left_right_adjectives(obj)
                    full_subject = generate_sub_compound(subject)
                    subject_text = " ".join(tok.lower_ for tok in full_subject)
                    if verb_negated or obj_negated:
                        verb_text = "!" + verb.lower_
                    else:
                        verb_text = verb.lower_
                    object_text = " ".join(tok.lower_ for tok in described_obj)
                    results.append((subject_text, verb_text, object_text))
        return results
    
    def generate_sub_compound(sub):
        """Expand ``sub`` into the list of tokens forming its compound.

        Left children whose label is in ``COMPOUNDS`` are expanded recursively
        and placed first, then ``sub`` itself, then the right children with such
        a label, expanded the same way.
        """
        def expand(children):
            # Recursively expand only the children labelled as compounds.
            parts = []
            for child in children:
                if child.dep_ in COMPOUNDS:
                    parts.extend(generate_sub_compound(child))
            return parts

        before = expand(sub.lefts)
        after = expand(sub.rights)
        return before + [sub] + after
    
    def generate_left_right_adjectives(obj):
        """Expand ``obj`` into itself plus its modifier tokens.

        The result lists, in order: the recursive expansion of every left child
        whose label is in ``ADJECTIVES``, ``obj`` itself, and the recursive
        expansion of every right child whose label is in ``ADJECTIVES``.
        """
        described = []
        for left in obj.lefts:
            if left.dep_ not in ADJECTIVES:
                continue
            described += generate_left_right_adjectives(left)
        described += [obj]
        for right in obj.rights:
            if right.dep_ not in ADJECTIVES:
                continue
            described += generate_left_right_adjectives(right)
        return described
    

    Now, when you pass a query such as:

    import spacy

    # Fixed: a bare ``English()`` from ``spacy.lang.en`` only tokenizes, so
    # ``pos_`` and ``dep_`` stay empty and findSVAOs() returns []. Load a trained
    # pipeline that includes a tagger and a dependency parser instead
    # (install it with: python -m spacy download en_core_web_sm).
    parser = spacy.load("en_core_web_sm")
    
    sentence = u"""
    Donald Trump is the worst president of USA, but Hillary is better than him
    """
    
    parse = parser(sentence)
    print(findSVAOs(parse))
    

    You will get the following:

    [(u'donald trump', u'is', u'worst president'), (u'hillary', u'is', u'better')]
    

    Thank you @Krzysiek for your solution too. I was unable to dig deeply enough into your library to modify it, so I instead adapted the code from the above-mentioned link to solve my problem.

提交回复
热议问题