Split paragraphs into Sentences

前端未结

关注

 5  1187

囚心锁ツ

I have a large bunch of text. For example

I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 20

相关标签:

5条回答

暗喜

2021-01-23 21:29

Here is matt's answer, updated for Swift 4:

 /// Splits `string` into sentences using Foundation's linguistic tagger.
 ///
 /// The text is tagged with the lexical-class scheme; every token tagged as a
 /// sentence terminator closes the current sentence. Each sentence is returned
 /// with surrounding whitespace and newlines trimmed, terminator included.
 ///
 /// - Parameter string: The paragraph to split.
 /// - Returns: The sentences in order of appearance. Text after the last
 ///   terminator is returned as a final element when it is not blank.
 func splitsentance(string: String) -> [String] {
    var tokenRanges = [Range<String.Index>]()
    let tags = string.linguisticTags(
        in: string.startIndex..<string.endIndex,
        scheme: NSLinguisticTagScheme.lexicalClass.rawValue,
        options: [], tokenRanges: &tokenRanges)

    // `tags` and `tokenRanges` are parallel arrays: keep only the ranges of
    // the tokens that terminate a sentence.
    let terminatorRanges = zip(tags, tokenRanges)
        .filter { $0.0 == NSLinguisticTag.sentenceTerminator.rawValue }
        .map { $0.1 }

    var sentences = [String]()
    var sentenceStart = string.startIndex
    for terminator in terminatorRanges {
        sentences.append(
            string[sentenceStart..<terminator.upperBound]
                .trimmingCharacters(in: .whitespacesAndNewlines))
        // Resume *after* the terminator; restarting on it would prefix the
        // next sentence with the previous sentence's punctuation.
        sentenceStart = terminator.upperBound
    }

    // Keep any trailing text that has no terminator instead of dropping it.
    if sentenceStart < string.endIndex {
        let remainder = string[sentenceStart...]
            .trimmingCharacters(in: .whitespacesAndNewlines)
        if !remainder.isEmpty {
            sentences.append(remainder)
        }
    }
    return sentences
}

0 讨论(0)

南旧

2021-01-23 21:33

Use NSLinguisticTagger. It gets the sentences right for your given input, because it analyzes in actual linguistic terms.

Here's a rough draft (Swift 1.2, this won't compile in Swift 2.0):

// Swift 1.2 draft: split the sample paragraph `s` into sentences, collecting them in `result`.
let s = "I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2. How do i split this."
// `r` is passed as the `tokenRanges:` out-argument; `t` is the returned tag list.
var r = [Range<String.Index>]()
let t = s.linguisticTagsInRange(
    indices(s), scheme: NSLinguisticTagSchemeLexicalClass,
    options: nil, tokenRanges: &r)
var result = [String]()
// Pair each tag with its position, keep the "SentenceTerminator" ones, and
// look up the start index of the matching range in `r`.
let ixs = Array(enumerate(t)).filter {
    $0.1 == "SentenceTerminator"
    }.map {r[$0.0].startIndex}
// `prev` marks where the sentence currently being collected begins.
var prev = s.startIndex
for ix in ixs {
    // Closed range, so the terminator itself stays in the sentence.
    // Note: this local `r` shadows the token-range array above.
    let r = prev...ix
    result.append(
        s[r].stringByTrimmingCharactersInSet(
             NSCharacterSet.whitespaceCharacterSet()))
    // Step past the terminator so the next sentence does not start with it.
    prev = advance(ix,1)
}
// Text after the last terminator (if any) is never appended to `result`.

Here is a Swift 2.0 version (updated to Xcode 7 beta 6):

// Swift 2.0 version: split the sample paragraph `s` into sentences, collecting them in `result`.
let s = "I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2. How do i split this."
// `r` is passed as the `tokenRanges:` out-argument; `t` is the returned tag list.
var r = [Range<String.Index>]()
let t = s.linguisticTagsInRange(
    s.characters.indices, scheme: NSLinguisticTagSchemeLexicalClass,
    tokenRanges: &r)
var result = [String]()
// Pair each tag with its position, keep the "SentenceTerminator" ones, and
// look up the start index of the matching range in `r`.
let ixs = t.enumerate().filter {
    $0.1 == "SentenceTerminator"
}.map {r[$0.0].startIndex}
// `prev` marks where the sentence currently being collected begins.
var prev = s.startIndex
for ix in ixs {
    // Closed range, so the terminator itself stays in the sentence.
    // Note: this local `r` shadows the token-range array above.
    let r = prev...ix
    result.append(
        s[r].stringByTrimmingCharactersInSet(
            NSCharacterSet.whitespaceCharacterSet()))
    // Step past the terminator so the next sentence does not start with it.
    prev = ix.advancedBy(1)
}
// Text after the last terminator (if any) is never appended to `result`.

And here it is updated for Swift 3:

// Swift 3 version: split the sample paragraph `s` into sentences, collecting them in `result`.
let s = "I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2. How do i split this."
// `r` is passed as the `tokenRanges:` out-argument; `t` is the returned tag list.
var r = [Range<String.Index>]()
let t = s.linguisticTags(
    in: s.startIndex..<s.endIndex,
    scheme: NSLinguisticTagSchemeLexicalClass,
    tokenRanges: &r)
var result = [String]()
// Pair each tag with its position, keep the "SentenceTerminator" ones, and
// look up the lower bound of the matching range in `r`.
let ixs = t.enumerated().filter {
    $0.1 == "SentenceTerminator"
    }.map {r[$0.0].lowerBound}
// `prev` marks where the sentence currently being collected begins.
var prev = s.startIndex
for ix in ixs {
    // Closed range, so the terminator itself stays in the sentence.
    // Note: this local `r` shadows the token-range array above.
    let r = prev...ix
    result.append(
        s[r].trimmingCharacters(
            in: NSCharacterSet.whitespaces))
    // Step past the terminator so the next sentence does not start with it.
    prev = s.index(after: ix)
}
// Text after the last terminator (if any) is never appended to `result`.

result is an array of four strings, one sentence per string:

["I want to split a paragraph into sentences.", 
 "But, there is a problem.", 
 "My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2.", 
 "How do i split this."]

0 讨论(0)

醉梦人生

2021-01-23 21:51

This is a rough version of what I believe you were looking for: I am running a loop through the characters, looking for the combination ". " (a period followed by a space).

As the loop runs the characters are added to currentSentence String?. When the combination is found, the currentSentence is added to sentences[sentenceNumber].

In addition, two exceptions have to be caught. The first is when the loop is on its second iteration, because period == index-1 holds there. The second is the last sentence, as there is no space after its period.

// Naive sentence splitter: walks the paragraph one character at a time and
// ends a sentence whenever a period is immediately followed by a space.
var paragraph = "I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 2014 , words like U.A.E abd numbers like 2.2. How do I split this."

var sentences = [String]()
var sentenceNumber = 0
// Stays optional to keep the variable's declared type; it is never set to nil,
// so it is accessed with optional chaining / `??` rather than force-unwrapping.
var currentSentence: String? = ""

// Materialise the characters once so `count` and indexing are cheap.
var charArray = Array(paragraph)
// Index of the most recently seen period (0 until one has been seen).
var period = 0

for (index, char) in charArray.enumerated() {
    currentSentence?.append(char)
    if char == "." {
        period = index

        // The final sentence has no space after its period, so flush it here.
        if period == charArray.count - 1 {
            sentences.append(currentSentence ?? "")
        }
    } else if (char == " " && period == index - 1 && index != 1) || period == charArray.count - 1 {
        // First clause: a space right after a period ends the sentence.
        // `index != 1` guards against the initial `period == 0` looking like
        // a period at index 0. The second clause can only hold for a
        // one-character paragraph, where the initial 0 equals the last index.
        sentences.append(currentSentence ?? "")
        print(period)
        currentSentence = ""
        sentenceNumber += 1
    }
}

0 讨论(0)

有刺的猬

2021-01-23 21:51

Enumerating by linguistic tags feels like an efficient way of handling this task. We can eliminate the overhead of storing superfluous strings.

var paragraph = """
    I want to split a paragraph into sentences. But, there is a problem.
    My paragraph includes dates like Jan.13, 2014 , words like U.A.E abd numbers like 2.2. And emojis like


          	          
            
           
            
                              
                
              
              
                
                  说谎        
                
              
                            
                2021-01-23 21:51
              
            
            
                                                                       
NSLinguisticTagger is deprecated. Use NLTagger instead (iOS 12.0+, macOS 10.14+).
import NaturalLanguage

var str = "I want to split a paragraph into sentences. But, there is a problem. My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2. How do i split this."

/// Splits `text` into sentences with `NLTagger`.
///
/// - Parameter text: The text to split.
/// - Returns: One string per sentence-unit range reported by the tagger,
///   copied verbatim from `text` (no trimming is applied).
func splitSentenceFrom(text: String) -> [String] {
    let sentenceTagger = NLTagger(tagSchemes: [.lexicalClass])
    sentenceTagger.string = text

    var sentences = [String]()
    let wholeText = text.startIndex..<text.endIndex
    // Only the range of each sentence unit is needed; the tag itself is ignored.
    sentenceTagger.enumerateTags(in: wholeText, unit: .sentence, scheme: .lexicalClass) { _, sentenceRange in
        sentences.append(String(text[sentenceRange]))
        return true  // keep enumerating
    }
    return sentences
}

// Split the sample text and print each sentence on its own line.
let sentences = splitSentenceFrom(text: str)

for sentence in sentences {
    print(sentence)
}

output:
I want to split a paragraph into sentences. 
But, there is a problem. 
My paragraph includes dates like Jan.13, 2014 , words like U.A.E and numbers like 2.2. 
How do i split this.

Want to exclude empty sentences and trim whitespace? Replace the append inside the closure with this:
// Trim surrounding whitespace/newlines and skip sentences that end up empty.
let sentence = String(text[tokenRange]).trimmingCharacters(in: .whitespacesAndNewlines)
if !sentence.isEmpty {
    result.append(sentence)
}

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...