Tokenize words in a list of sentences Python

后端 未结 7 1470
广开言路
广开言路 2021-02-04 06:41

I currently have a file that contains a list that looks like this:

example = [\'Mary had a little lamb\' , 
           \'Jack went up the hill\' , 
           \'Ji         


        
相关标签:
7条回答
  • 2021-02-04 07:06

    In Spacy it will be as simple as :

    import spacy
    
    # Sentences to tokenize, one string per entry.
    example = [
        'Mary had a little lamb',
        'Jack went up the hill',
        'Jill followed suit',
        'i woke up suddenly',
        'it was a really bad dream...',
    ]

    # Load the pipeline once and reuse it for every sentence.
    nlp = spacy.load("en_core_web_sm")

    # Iterating over the object returned by nlp(...) yields its tokens, so
    # each sentence becomes one list of tokens.
    result = [list(nlp(sentence)) for sentence in example]

    print(result)
    

    And the output will be :

    [[Mary, had, a, little, lamb], [Jack, went, up, the, hill], [Jill, followed, suit], [i, woke, up, suddenly], [it, was, a, really, bad, dream, ...]]
    
    0 讨论(0)
  • 2021-02-04 07:11

    You can use nltk (as @alvas suggests) and a recursive function which takes any object and tokenizes each str in it:

    from nltk.tokenize import word_tokenize
    def tokenize(obj):
        """Tokenize every string found in ``obj``, recursing into lists.

        A string is handed to ``word_tokenize``. A list is rebuilt with each
        element tokenized in turn. ``None`` and any other kind of object are
        returned unchanged.
        """
        if isinstance(obj, str):  # basestring in python 2.7
            return word_tokenize(obj)
        if isinstance(obj, list):
            return list(map(tokenize, obj))
        # None and unsupported types pass through untouched
        # (alternatively: raise an exception, or parse a dict...).
        return obj
    

    Usage:

    data = [["Lorem ipsum dolor. Sit amet?", "Hello World!", None], ["a"], "Hi!", None, ""]
    print(tokenize(data))
    

    Output:

    [[['Lorem', 'ipsum', 'dolor', '.', 'Sit', 'amet', '?'], ['Hello', 'World', '!'], None], [['a']], ['Hi', '!'], None, []]
    
    0 讨论(0)
  • 2021-02-04 07:12

    It's hard for me to tell what you are trying to do.

    How about this

    # Words to drop from every sentence; a set gives constant-time lookups.
    exclude = {'Mary', 'Jack', 'Jill', 'i', 'it'}

    mod_example = []
    for sentence in example:
        # Build a filtered copy rather than calling words.remove() while
        # looping over the same list: removing during iteration makes the
        # loop skip the element that follows each removed word, so two
        # adjacent excluded words would not both be dropped.
        words = [word for word in sentence.split() if word not in exclude]
        # Wrap each remaining word in single quotes, separated by spaces.
        mod_example.append('\'' + '\' \''.join(words) + '\'')

    # Parenthesised so the line is valid under both Python 2 and Python 3.
    print(mod_example)
    

    Which outputs

    ["'had' 'a' 'little' 'lamb'", "'went' 'up' 'the' 'hill'", "'followed' 'suit'", 
    "'woke' 'up' 'suddenly'", "'was' 'a' 'really' 'bad' 'dream...'"]
    >>> 
    

    Edit: Another suggestion based on further info given by the OP

    example = [
        'Area1 Area1 street one, 4454 hikoland',
        'Area2 street 2, 52432 hikoland, area2',
        'Area3 ave three, 0534 hikoland',
    ]

    mod_example = []
    for sentence in example:
        tokens = sentence.split()
        # The first token is the label; everything after it is the rest.
        label, rest = tokens[0], tokens[1:]
        # Drop one repeat of the label from the rest, preferring the exact
        # spelling and falling back to its lower-case form.
        for candidate in (label, label.lower()):
            if candidate in rest:
                rest.remove(candidate)
                break
        mod_example.append(label + ': ' + ' '.join(rest))
    

    Outputs

    >>> print mod_example
    ['Area1: street one, 4454 hikoland', 'Area2: street 2, 52432 hikoland,', 
    'Area3: ave three, 0534 hikoland']
    >>> 
    
    0 讨论(0)
  • 2021-02-04 07:15

    Break down the list "Example"

    # One list of whitespace-separated words per sentence.
    first_split = [sentence.split() for sentence in example]
    

    Break down the elements of first_split list

    # Split every word of every sentence once more; each result is appended
    # as its own list, in the original order.
    second_split = [word.split() for words in first_split for word in words]
    

    Break down the elements of the second_split list and append them to the final list, in the form the coder needs for the output

    # Collect every distinct word, keeping the order of first appearance.
    final_list = []
    for word in (piece for pieces in second_split for piece in pieces):
        if word in final_list:
            continue
        final_list.append(word)

    print(final_list)
    
    0 讨论(0)
  • 2021-02-04 07:16

    I wrote this script to help everyone understand how tokenizing works, so they can build their own Natural Language Processing engine themselves.

    import re
    from contextlib import redirect_stdout
    from io import StringIO
    
    example = 'Mary had a little lamb, Jack went up the hill, Jill followed suit, i woke up suddenly, it was a really bad dream...'
    
    def token_to_sentence(str):
        """Split a paragraph into sentences at punctuation marks.

        Args:
            str: The paragraph text. The name hides the built-in ``str``
                inside this function; it is kept so that existing keyword
                callers keep working.

        Returns:
            A list of the sentences in order of appearance, each stripped of
            surrounding whitespace. Empty pieces (for example the ones
            between the dots of ``...``) are left out, so an empty or
            punctuation-only paragraph gives ``[]``.
        """
        # Raw string: \w and \s are regex classes, not string escapes.
        # Each match captures a run of word/space characters and then
        # consumes any punctuation that follows it; because the punctuation
        # is optional, text after the last punctuation mark is kept too.
        pieces = re.findall(r'([\w\s]{0,})[^\w\s]{0,}', str)
        # strip() removes the blank left after each separator without
        # cutting the first word off a sentence that has no leading space.
        sentences = [piece.strip() for piece in pieces]
        # Filter by value (truthiness), not by identity with a literal.
        return [sentence for sentence in sentences if sentence]
    
    def token_to_words(str):
        """Split sentences into one flat list of words.

        Args:
            str: An iterable of sentence strings, such as the list returned
                by ``token_to_sentence``. A single sentence string is also
                accepted and treated as a one-element list. The name hides
                the built-in ``str`` inside this function; it is kept so
                that existing keyword callers keep working.

        Returns:
            A list with every run of regex word characters found in the
            sentences, in order. Punctuation and whitespace are dropped.
        """
        # The parameter hides the built-in type, so recover it from a
        # literal. Without this check a lone string would be iterated
        # character by character.
        sentences = [str] if isinstance(str, type('')) else str
        words = []
        for sentence in sentences:
            # \w+ never produces empty matches, so nothing needs filtering.
            words.extend(re.findall(r'\w+', sentence))
        return words
    

    I use a slightly different process here: I start again from a whole paragraph, to make the word processing easier to follow. The paragraph to process is:

    example = 'Mary had a little lamb, Jack went up the hill, Jill followed suit, i woke up suddenly, it was a really bad dream...'
    

    tokenize paragraph to sentence:

    sentence = token_to_sentence(example)
    

    will result:

    ['Mary had a little lamb', 'Jack went up the hill', 'Jill followed suit', 'i woke up suddenly', 'it was a really bad dream']
    

    tokenize to words:

    words = token_to_words(sentence)
    

    will result:

    ['Mary', 'had', 'a', 'little', 'lamb', 'Jack', 'went', 'up', 'the', 'hill', 'Jill', 'followed', 'suit', 'i', 'woke', 'up', 'suddenly', 'it', 'was', 'a', 'really', 'bad', 'dream']
    

    i will explain how this work.

    First, I used a regex that captures all the words and the spaces separating them, and stops when it finds a punctuation mark. The regex is:

    ([\w\s]{0,})[^\w\s]{0,}
    

    So the match captures the words and spaces shown in parentheses:

    '(Mary had a little lamb),( Jack went up the hill, Jill followed suit),( i woke up suddenly),( it was a really bad dream)...'
    

    The result is still not clean: it contains some empty strings, so I used this expression to remove them:

    [x for x in regex_of_sentence if x is not '']
    

    Now the paragraph is tokenized into sentences, but the sentences are not clean yet. The result is:

    ['Mary had a little lamb', ' Jack went up the hill', ' Jill followed suit', ' i woke up suddenly', ' it was a really bad dream']
    

    As you can see, some sentences in the result start with a space. To get clean sentences without a leading space, I wrote this regex:

    \s([\w\s]{0,})
    

    it will make a clear sentence like:

    ['Mary had a little lamb', 'Jack went up the hill', 'Jill followed suit', 'i woke up suddenly', 'it was a really bad dream']
    

    So we need two steps to get a good result.

    the answer of your question is start from here...

    To tokenize the sentences into words, I iterate over the sentences and use this regex to capture just the words during the iteration:

    ([\w]{0,})
    

    and clear the empty characters again with:

    [x for x in regex_of_word if x is not '']
    

    So the result is clean: just the list of words:

    ['Mary', 'had', 'a', 'little', 'lamb', 'Jack', 'went', 'up', 'the', 'hill', 'Jill', 'followed', 'suit', 'i', 'woke', 'up', 'suddenly', 'it', 'was', 'a', 'really', 'bad', 'dream']
    

    In the future, to build a good NLP engine, you need your own phrase database and must check whether each phrase occurs in the sentence; once the phrases have been listed, the remaining words are plain single words.

    With this method I can build my own NLP for my language (Bahasa Indonesia), which really lacks modules.

    edited:

    I didn't notice that your question asks to compare the words. So you have another sentence to compare against... As a bonus, I'll show you not only how to compare them but also how to count the matches.

    mod_example = ["'Mary' 'had' 'a' 'little' 'lamb'" , 'Jack' 'went' 'up' 'the' 'hill']
    

    In this case the steps are: 1. iterate over mod_example; 2. compare the words of the first sentence with the words from mod_example; 3. do some calculation.

    so the script will be:

    import re
    from contextlib import redirect_stdout
    from io import StringIO
    
    # Paragraph that the functions below tokenize.
    example = 'Mary had a little lamb, Jack went up the hill, Jill followed suit, i woke up suddenly, it was a really bad dream...'
    # NOTE(review): the adjacent literals 'Jack' 'went' 'up' 'the' 'hill' have
    # no commas between them, so Python joins them into the single string
    # 'Jackwentupthehill'. This list therefore holds two strings, not
    # individual words - confirm whether a list of words was intended.
    mod_example = ["'Mary' 'had' 'a' 'little' 'lamb'" , 'Jack' 'went' 'up' 'the' 'hill']
    
    def token_to_sentence(str):
        """Split a paragraph into sentences at punctuation marks.

        Args:
            str: The paragraph text. The name hides the built-in ``str``
                inside this function; it is kept so that existing keyword
                callers keep working.

        Returns:
            A list of the sentences in order of appearance, each stripped of
            surrounding whitespace. Empty pieces (for example the ones
            between the dots of ``...``) are left out, so an empty or
            punctuation-only paragraph gives ``[]``.
        """
        # Raw string: \w and \s are regex classes, not string escapes.
        # Each match captures a run of word/space characters and then
        # consumes any punctuation that follows it; because the punctuation
        # is optional, text after the last punctuation mark is kept too.
        pieces = re.findall(r'([\w\s]{0,})[^\w\s]{0,}', str)
        # strip() removes the blank left after each separator without
        # cutting the first word off a sentence that has no leading space.
        sentences = [piece.strip() for piece in pieces]
        # Filter by value (truthiness), not by identity with a literal.
        return [sentence for sentence in sentences if sentence]
    
    def token_to_words(str):
        """Split sentences into one flat list of words.

        Args:
            str: An iterable of sentence strings, such as the list returned
                by ``token_to_sentence``. A single sentence string is also
                accepted and treated as a one-element list. The name hides
                the built-in ``str`` inside this function; it is kept so
                that existing keyword callers keep working.

        Returns:
            A list with every run of regex word characters found in the
            sentences, in order. Punctuation and whitespace are dropped.
        """
        # The parameter hides the built-in type, so recover it from a
        # literal. Without this check a lone string would be iterated
        # character by character.
        sentences = [str] if isinstance(str, type('')) else str
        words = []
        for sentence in sentences:
            # \w+ never produces empty matches, so nothing needs filtering.
            words.extend(re.findall(r'\w+', sentence))
        return words
    
    def convert_to_words(str):
        """Tokenize a paragraph into words by way of its sentences.

        Args:
            str: The paragraph text. The name hides the built-in ``str``
                inside this function; it is kept so that existing keyword
                callers keep working.

        Returns:
            Whatever ``token_to_words`` returns for the full list of
            sentences produced by ``token_to_sentence``.
        """
        # Hand over the whole sentence list in a single call. Calling
        # token_to_words once per sentence would pass it a bare string,
        # keep only the last sentence's result, and leave the result
        # unbound (UnboundLocalError) when there are no sentences at all.
        return token_to_words(token_to_sentence(str))
    
    def compare_list_of_words__to_another_list_of_words(from_strA, to_strB):
        """Print how often each distinct word of ``from_strA`` occurs in ``to_strB``.

        Args:
            from_strA: Iterable of words to look up. Duplicates are reported
                once, in order of first appearance.
            to_strB: Sequence to search in. It must support ``len()`` and
                ``.count()``; each word is counted with ``to_strB.count``.

        Returns:
            None. One three-line report per distinct word is written to
            standard output: the word, its match count out of
            ``len(to_strB)``, and that ratio as a percentage.
        """
        totalB = len(to_strB)
        seen = set()
        for word_to_match in from_strA:
            # Report each word once, in a stable order (a bare set() would
            # give an arbitrary order).
            if word_to_match in seen:
                continue
            seen.add(word_to_match)
            number_of_match = to_strB.count(word_to_match)
            # An empty to_strB would otherwise raise ZeroDivisionError.
            percent = (number_of_match / totalB) * 100 if totalB else 0.0
            # Numbers must be converted explicitly: str + int is a TypeError.
            print('words: -- ' + word_to_match + ' --' + '\n'
                  '       number of match    : ' + str(number_of_match) + ' from ' + str(totalB) + '\n'
                  '       percent of match   : ' + str(percent) + ' percent')
    
    
    
    # Everything is defined above; the processing starts here.

    if __name__ == '__main__':
        # Run the whole pipeline: split the paragraph in ``example`` into
        # sentences, split those sentences into words, then compare that
        # list of words with the entries of ``mod_example``.
        compare_list_of_words__to_another_list_of_words(
            token_to_words(token_to_sentence(example)),
            mod_example,
        )
    
    0 讨论(0)
  • 2021-02-04 07:24

    This also can be done by pytorch torchtext as

    from torchtext.data import get_tokenizer
    
    tokenizer = get_tokenizer('basic_english')
    example = [
        'Mary had a little lamb',
        'Jack went up the hill',
        'Jill followed suit',
        'i woke up suddenly',
        'it was a really bad dream...',
    ]
    # Flatten: the tokens of every sentence go into one list, in order.
    tokens = [token for sentence in example for token in tokenizer(sentence)]
    # ['mary', 'had', 'a', 'little', 'lamb', 'jack', 'went', 'up', 'the', 'hill', 'jill', 'followed', 'suit', 'i', 'woke', 'up', 'suddenly', 'it', 'was', 'a', 'really', 'bad', 'dream', '.', '.', '.']
    
    0 讨论(0)
提交回复
热议问题