I have a text file. I need to get a list of sentences.
How can this be implemented? There are a lot of subtleties, such as a dot being used in abbreviations.
This function can split the entire text of Huckleberry Finn into sentences in about 0.1 seconds, and it handles many of the more painful edge cases that make sentence parsing non-trivial, e.g.: "Mr. John Johnson Jr. was born in the U.S.A but earned his Ph.D. in Israel before joining Nike Inc. as an engineer. He also worked at craigslist.org as a business analyst."
# -*- coding: utf-8 -*-
import re
# Regex fragments consumed by split_into_sentences(). Each fragment exposes
# exactly one capturing group so the substitutions can refer to it as \1, \2...
# Raw strings are used so that backslash sequences such as \s reach the regex
# engine untouched instead of being treated as (invalid) string escapes.

# A single ASCII letter (used for initials such as "J." and dotted acronyms).
alphabets = r"([A-Za-z])"
# Titles followed by a period, e.g. "Mr." or "Dr.".
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
# Abbreviations that commonly trail a name, e.g. "Inc" or "Jr" (period not included).
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
# Words that typically open a new sentence when they follow an abbreviation.
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
# Two- or three-letter upper-case dotted acronyms, e.g. "U.S." or "U.S.A.".
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
# A period followed by a common top-level domain, e.g. ".org".
websites = r"[.](com|net|org|io|gov)"
def split_into_sentences(text):
    """Split *text* into a list of sentences.

    The function works in three phases:

    1. Every period that does NOT end a sentence (titles, initials, acronyms,
       company suffixes, web domains, "Ph.D.") is replaced by the placeholder
       ``<prd>``.
    2. Every remaining ``.``, ``?`` and ``!`` is followed by the placeholder
       ``<stop>``.
    3. ``<prd>`` is turned back into a period and the text is cut at ``<stop>``.

    Relies on the module-level pattern strings ``alphabets``, ``prefixes``,
    ``suffixes``, ``starters``, ``acronyms`` and ``websites``.

    Args:
        text: The text to split. Newlines are treated as spaces.

    Returns:
        A list of stripped, non-empty sentence strings in their original order.
        Trailing text without terminating punctuation is returned as the last
        item. A period/!/? directly before a straight double quote (and a
        period before a curly closing quote) is moved after the quote so the
        quote stays with its sentence.

    Note:
        Input that literally contains ``<prd>`` or ``<stop>`` will be
        misinterpreted, because those tokens are used as internal markers.
    """
    # Pad with spaces so patterns anchored on a leading/trailing space also
    # match at the very start and end of the text.
    text = " " + text + " "
    text = text.replace("\n", " ")

    # --- Phase 1: protect periods that are not sentence boundaries ---------
    text = re.sub(prefixes, r"\1<prd>", text)            # "Mr." -> "Mr<prd>"
    text = re.sub(websites, r"<prd>\1", text)            # ".org" -> "<prd>org"
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    # Single-letter initial surrounded by whitespace: " J. " -> " J<prd> ".
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a typical sentence opener does end the
    # sentence, so mark the boundary before the acronym's dots are protected.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    # Dotted letter runs; the three-letter form must run before the two-letter
    # form so "A.B.C." is consumed as a whole.
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # "Inc. He ..." ends a sentence; any other "Inc." does not.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move terminators outside closing quotes so the quote character is kept
    # with the sentence it closes rather than starting the next one.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # --- Phase 2: mark the real sentence boundaries ------------------------
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")

    # --- Phase 3: restore protected periods and split ----------------------
    text = text.replace("<prd>", ".")
    pieces = [piece.strip() for piece in text.split("<stop>")]
    # Drop empty pieces (e.g. the padding after the final terminator) instead
    # of blindly discarding the last piece, which could be a real sentence
    # that simply lacks closing punctuation.
    return [piece for piece in pieces if piece]