I have a large bunch of text. For example
I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 20
Here is matt's answer, updated for Swift 4:
/// Splits `string` into sentences using Foundation's linguistic tagging.
///
/// The text is tagged with the lexical-class scheme and a sentence is closed
/// at every token tagged `"SentenceTerminator"`, so periods inside dates,
/// abbreviations or decimals are only treated as boundaries when the tagger
/// classifies them as terminators.
///
/// - Parameter string: The text to split.
/// - Returns: The sentences in order, each including its terminator and
///   trimmed of surrounding whitespace and newlines. Non-blank text that
///   follows the last terminator is returned as a final element.
func splitsentance(string: String) -> [String] {
    let s = string
    var tokenRanges = [Range<String.Index>]()
    let tags = s.linguisticTags(
        in: s.startIndex..<s.endIndex,
        scheme: NSLinguisticTagScheme.lexicalClass.rawValue,
        options: [],
        tokenRanges: &tokenRanges)

    var result = [String]()
    var prev = s.startIndex
    for (tag, tokenRange) in zip(tags, tokenRanges) where tag == "SentenceTerminator" {
        // Slice up to the END of the terminator token and resume from there,
        // so the next sentence never starts with the previous terminator.
        let sentenceEnd = tokenRange.upperBound
        result.append(
            s[prev..<sentenceEnd].trimmingCharacters(in: .whitespacesAndNewlines))
        prev = sentenceEnd
    }

    // Keep trailing text that has no terminator instead of dropping it.
    let tail = s[prev...].trimmingCharacters(in: .whitespacesAndNewlines)
    if !tail.isEmpty {
        result.append(tail)
    }
    return result
}
Use NSLinguisticTagger. It gets the sentences right for your given input, because it analyzes in actual linguistic terms.
Here's a rough draft (Swift 1.2, this won't compile in Swift 2.0):
// Swift 1.2 draft: split `s` into sentences at tokens tagged "SentenceTerminator".
// Sample text with a date ("Jan.13, 2014"), an abbreviation ("U.A.E") and a decimal ("2.2").
let s = "I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2. How do i split this."
// Filled by the tagging call below (passed by reference as `tokenRanges`).
var r = [Range<String.Index>]()
// Tag the whole string using the lexical-class scheme.
let t = s.linguisticTagsInRange(
indices(s), scheme: NSLinguisticTagSchemeLexicalClass,
options: nil, tokenRanges: &r)
var result = [String]()
// Start index of every token whose tag is "SentenceTerminator"
// (indexes `r` with the tag's position, so `r` is presumably parallel to `t`).
let ixs = Array(enumerate(t)).filter {
$0.1 == "SentenceTerminator"
}.map {r[$0.0].startIndex}
// `prev` is where the sentence currently being sliced begins.
var prev = s.startIndex
for ix in ixs {
// Closed range: the slice includes the character at the terminator's start index.
let r = prev...ix
result.append(
s[r].stringByTrimmingCharactersInSet(
NSCharacterSet.whitespaceCharacterSet()))
// Resume one position past the terminator's start index.
prev = advance(ix,1)
}
Here is a Swift 2.0 version (updated to Xcode 7 beta 6):
// Swift 2.0 version: split `s` into sentences at tokens tagged "SentenceTerminator".
// Sample text with a date ("Jan.13, 2014"), an abbreviation ("U.A.E") and a decimal ("2.2").
let s = "I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2. How do i split this."
// Filled by the tagging call below (passed by reference as `tokenRanges`).
var r = [Range<String.Index>]()
// Tag the whole string using the lexical-class scheme.
let t = s.linguisticTagsInRange(
s.characters.indices, scheme: NSLinguisticTagSchemeLexicalClass,
tokenRanges: &r)
var result = [String]()
// Start index of every token whose tag is "SentenceTerminator"
// (indexes `r` with the tag's position, so `r` is presumably parallel to `t`).
let ixs = t.enumerate().filter {
$0.1 == "SentenceTerminator"
}.map {r[$0.0].startIndex}
// `prev` is where the sentence currently being sliced begins.
var prev = s.startIndex
for ix in ixs {
// Closed range: the slice includes the character at the terminator's start index.
let r = prev...ix
result.append(
s[r].stringByTrimmingCharactersInSet(
NSCharacterSet.whitespaceCharacterSet()))
// Resume one position past the terminator's start index.
prev = ix.advancedBy(1)
}
And here it is updated for Swift 3:
// Swift 3 version: split `s` into sentences at tokens tagged "SentenceTerminator".
// Sample text with a date ("Jan.13, 2014"), an abbreviation ("U.A.E") and a decimal ("2.2").
let s = "I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2. How do i split this."
// Filled by the tagging call below (passed by reference as `tokenRanges`).
var r = [Range<String.Index>]()
// Tag the whole string using the lexical-class scheme.
let t = s.linguisticTags(
in: s.startIndex..<s.endIndex,
scheme: NSLinguisticTagSchemeLexicalClass,
tokenRanges: &r)
var result = [String]()
// Lower bound of every token whose tag is "SentenceTerminator"
// (indexes `r` with the tag's position, so `r` is presumably parallel to `t`).
let ixs = t.enumerated().filter {
$0.1 == "SentenceTerminator"
}.map {r[$0.0].lowerBound}
// `prev` is where the sentence currently being sliced begins.
var prev = s.startIndex
for ix in ixs {
// Closed range: the slice includes the character at the terminator's lower bound.
let r = prev...ix
result.append(
s[r].trimmingCharacters(
in: NSCharacterSet.whitespaces))
// Resume at the index after the terminator's lower bound.
prev = s.index(after: ix)
}
result
is an array of four strings, one sentence per string:
["I want to split a paragraph into sentences.",
"But, there is a problem.",
"My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2.",
"How do i split this."]
This is a rough version of what I believe you were looking for: I am running a loop through the characters looking for the combination of ". "
As the loop runs the characters are added to currentSentence String?
. When the combination is found, the currentSentence
is added to sentences[sentenceNumber]
.
In addition, 2 exceptions have to be caught, the first when the loop is on iteration 2 as period == index-1
. The second is the last sentence as there is no space after the period.
// Naive sentence splitter: a sentence ends where a "." is immediately
// followed by a space. Periods followed by anything else (as in "Jan.13",
// "U.A.E" or "2.2") do not end a sentence.
var paragraph = "I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 2014 , words like U.A.E abd numbers like 2.2. How do I split this."
var sentences = [String]()
// Index of the sentence currently being accumulated.
var sentenceNumber = 0
var currentSentence = ""
// Position of the most recent "."; nil until one has been seen, so no
// special case is needed for the first characters of the paragraph.
var period: Int? = nil

for (index, char) in paragraph.enumerated() {
    currentSentence.append(char)
    if char == "." {
        period = index
    } else if char == " " && period == index - 1 {
        // ". " found: close the sentence (the separating space is kept at its end).
        sentences.append(currentSentence)
        currentSentence = ""
        sentenceNumber += 1
    }
}

// Flush the final sentence, which has no space after its period, and keep
// trailing text even when it lacks a closing period.
if currentSentence.contains(where: { !$0.isWhitespace }) {
    sentences.append(currentSentence)
}
Enumerating by linguistic tags feels like an efficient way of handling this task. We can eliminate the overhead of storing superfluous strings.
var paragraph = """
I want to split a paragraph into sentences. But, there is a problem.
My paragraph includes dates like Jan.13, 2014 , words like U.A.E abd numbers like 2.2. And emojis like
NSLinguisticTagger
is deprecated. Use NLTagger
instead. (iOS 12.0+, macOS 10.14+)
import NaturalLanguage
// Sample paragraph with a date ("Jan.13, 2014"), an abbreviation ("U.A.E") and a decimal ("2.2").
var str = "I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2. How do i split this."
/// Splits `text` into sentences by enumerating it with `NLTagger` at
/// sentence granularity.
///
/// - Parameter text: The text to split.
/// - Returns: One string per enumerated sentence range, in order. The
///   substrings are returned exactly as sliced; nothing is trimmed or filtered.
func splitSentenceFrom(text: String) -> [String] {
    let sentenceTagger = NLTagger(tagSchemes: [.lexicalClass])
    sentenceTagger.string = text

    var sentences = [String]()
    let wholeText = text.startIndex..<text.endIndex
    sentenceTagger.enumerateTags(in: wholeText, unit: .sentence, scheme: .lexicalClass) { _, sentenceRange in
        // The tag itself is not needed, only the range of each sentence.
        sentences.append(String(text[sentenceRange]))
        return true  // keep enumerating
    }
    return sentences
}
// Split the sample text and print one sentence per line.
let sentences = splitSentenceFrom(text: str)
for sentence in sentences {
    print(sentence)
}
output:
I want to split a paragraph into sentences.
But, there is a problem.
My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2.
How do i split this.
Want to exclude empty sentences and trim whitespace? Add this:
// Trim surrounding whitespace/newlines and keep the sentence only if
// something is left.
let trimmed = String(text[tokenRange]).trimmingCharacters(in: .whitespacesAndNewlines)
if !trimmed.isEmpty {
    result.append(trimmed)
}