Stem, Lemmatize words with frequency (filter&analyze)

问题

Okay. I am trying to add a word_tag, But I am not sure if this is the right way. (sorry I am a newbie)

from nltk.corpus import wordnet as wn

# Count the words
        word_count = Counter(words)

        # Clean the content a little
        filter_words = ['artists']
        for word in filter_words:
            if word in word_count:
                del word_count[word]

        # POS_TAG the words
        word_tag = nltk.corpus.wn.synsets(word_count)

        # And the survey says...
        print("The Top {0} words".format(n))
        for word, count, word_tag in word_count.most_common(n) and nltk.corpus.wordnet.synsets(n):
            print("{0}: {1, 2}".format(word, count, word_tag))

I'd like to make a DB table with stemmed/lemmatized words' frequency and tagged Part-of-speech Tagging(VERB,NOUN,ADV,..) like this.. http://www.nltk.org/book/ch05.html#tab-universal-tagset

How can I solve the error? on mySQL db, # | word | POS Tag. | Frequency I also looking for a way to drop words that is not on the dictionary (artistessex, asifyou) since I parse words using len...

    ##
import re
import MySQLdb as mdb
import xml.etree.ElementTree as ET    
import requests, re
from xml.etree import ElementTree
from collections import Counter
from lxml import html
import nltk
from nltk.corpus import wordnet
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer

##    

def is_noun(tag):
        return tag in ['NN', 'NNS', 'NNP', 'NNPS']


    def is_verb(tag):
        return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


    def is_adverb(tag):
        return tag in ['RB', 'RBR', 'RBS']


    def is_adjective(tag):
        return tag in ['JJ', 'JJR', 'JJS']


    def penn_to_wn(tag):
        if is_adjective(tag):
            return wn.ADJ
        elif is_noun(tag):
            return wn.NOUN
        elif is_adverb(tag):
            return wn.ADV
        elif is_verb(tag):
            return wn.VERB
        return None


    stemmer = PorterStemmer()
    lemmatiser = WordNetLemmatizer()



    ## XML PARSING
    def main(n=10):

        # A list of feeds to process and their xpath


        feeds = [
            {'url': 'http://www.nyartbeat.com/list/event_type_print_painting.en.xml', 'xpath': './/Description'},
            {'url': 'http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml', 'xpath': './/description'}
        ]



        # A place to hold all feed results
        results = []

        # Loop all the feeds
        for feed in feeds:
            # Append feed results together
            results = results + process(feed['url'], feed['xpath'])

        # Join all results into a big string
        contents=",".join(map(str, results))

        # Remove double+ spaces
        contents = re.sub('\s+', ' ', contents)

        # Remove everything that is not a character or whitespace
        contents = re.sub('[^A-Za-z ]+', '', contents)

        # Create a list of lower case words that are at least 8 characters
        words=[w.lower() for w in contents.split() if len(w) >=8 ]


        # Count the words
        word_count = Counter(words)

        # POS_TAG the words

        word_stem = stemmer.stem(words)
        word_refine = lemmatiser.lemmatize(word_stem)
    #    tokens = word_tokenize(words) # Generate list of tokens
    #    tokens_pos = pos_tag(tokens)


        # Clean the content a little
        filter_words = ['artists']
        for word in filter_words:
            if word in word_refine:
                del word_refine[word]


        # And the survey says...

        print("The Top {0} words".format(n))
        for word, pos in word_refine.stemmer.stem(n):

            for word, count in word_count.most_common(n):
                print("{0}: {1, 2}".format(word, pos, count))



    def process(url, xpath):
        """
        Downloads a feed url and extracts the results with a variable path
        :param url: string
        :param xpath: string
        :return: list
        """
        contents = requests.get(url)
        root = ElementTree.fromstring(contents.content)
        return [element.text.encode('utf8') if element.text is not None else '' for element in root.findall(xpath)]


    # Add to DB
        for word, count in word_count.most_common(n):

                sql = """INSERT INTO Table1 (keyword, pos, freq) VALUES(%s, %s, %s)"""
                cursor.execute(sql, (word, pos, count))
                db.commit()


    if __name__ == "__main__":
        main()

来源：https://stackoverflow.com/questions/33727269/stem-lemmatize-words-with-frequency-filteranalyze

标签

python

mysql

xml

nltk

word