I am trying to split a term which contains a hashtag of multiple words such as \"#I-am-great\" or \"#awesome-dayofmylife\'
then the output that I am looking for is:
The problem can be broken down to several steps:
Here is one solution using this approach:
# Returns a list of common english terms (words)
def initialize_words():
content = None
with open('C:\wordlist.txt') as f: # A file containing common english words
content = f.readlines()
return [word.rstrip('\n') for word in content]
def parse_sentence(sentence, wordlist):
new_sentence = "" # output
terms = sentence.split(' ')
for term in terms:
if term[0] == '#': # this is a hashtag, parse it
new_sentence += parse_tag(term, wordlist)
else: # Just append the word
new_sentence += term
new_sentence += " "
return new_sentence
def parse_tag(term, wordlist):
words = []
# Remove hashtag, split by dash
tags = term[1:].split('-')
for tag in tags:
word = find_word(tag, wordlist)
while word != None and len(tag) > 0:
words.append(word)
if len(tag) == len(word): # Special case for when eating rest of word
break
tag = tag[len(word):]
word = find_word(tag, wordlist)
return " ".join(words)
def find_word(token, wordlist):
i = len(token) + 1
while i > 1:
i -= 1
if token[:i] in wordlist:
return token[:i]
return None
wordlist = initialize_words()
sentence = "big #awesome-dayofmylife because #iamgreat"
parse_sentence(sentence, wordlist)
It prints:
'big awe some day of my life because i am great '
You will have to remove the trailing space, but that's easy. :)
I got the wordlist from http://www-personal.umich.edu/~jlawler/wordlist.