Extract meter levels from audio file

前端未结

关注

 2  647

醉梦人生 2021-01-31 19:22

I need to extract audio meter levels from a file so I can render the levels before playing the audio. I know AVAudioPlayer can get this information while playing th

2条回答

温柔的废话 (楼主)

2021-01-31 20:07

Swift 4

It takes on an iPhone:

0.538s to process an 8MByte mp3 player with a 4min47s duration, and 44,100 sampling rate
0.170s to process an 712KByte mp3 player with a 22s duration, and 44,100 sampling rate
0.089s to process caffile created by converting the file above using this command afconvert -f caff -d LEI16 audio.mp3 audio.caf in the terminal.

Let's begin:

A) Declare this class that is going to hold the necessary information about the audio asset:

/// Holds audio information used for building waveforms
final class AudioContext {
    
    /// The audio asset URL used to load the context
    public let audioURL: URL
    
    /// Total number of samples in loaded asset
    public let totalSamples: Int
    
    /// Loaded asset
    public let asset: AVAsset
    
    // Loaded assetTrack
    public let assetTrack: AVAssetTrack
    
    private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
        self.audioURL = audioURL
        self.totalSamples = totalSamples
        self.asset = asset
        self.assetTrack = assetTrack
    }
    
    public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
        let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
        
        guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
            fatalError("Couldn't load AVAssetTrack")
        }
        
        asset.loadValuesAsynchronously(forKeys: ["duration"]) {
            var error: NSError?
            let status = asset.statusOfValue(forKey: "duration", error: &error)
            switch status {
            case .loaded:
                guard
                    let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
                    let audioFormatDesc = formatDescriptions.first,
                    let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
                    else { break }
                
                let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
                let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
                completionHandler(audioContext)
                return
                
            case .failed, .cancelled, .loading, .unknown:
                print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
            }
            
            completionHandler(nil)
        }
    }
}

We are going to use its asynchronous function load, and handle its result to a completion handler.

B) Import AVFoundation and Accelerate in your view controller:

import AVFoundation
import Accelerate

C) Declare the noise level in your view controller (in dB):

let noiseFloor: Float = -80

For example, anything less than -80dB will be considered as silence.

D) The following function takes an audio context and produces the desired dB powers. targetSamples is by default set to 100, you can change that to suit your UI needs:

func render(audioContext: AudioContext?, targetSamples: Int = 100) -> [Float]{
    guard let audioContext = audioContext else {
        fatalError("Couldn't create the audioContext")
    }
    
    let sampleRange: CountableRange = 0..?
        CMBlockBufferGetDataPointer(readBuffer, 0, &readBufferLength, nil, &readBufferPointer)
        sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
        CMSampleBufferInvalidate(readSampleBuffer)
        
        let totalSamples = sampleBuffer.count / MemoryLayout.size
        let downSampledLength = totalSamples / samplesPerPixel
        let samplesToProcess = downSampledLength * samplesPerPixel
        
        guard samplesToProcess > 0 else { continue }
        
        processSamples(fromData: &sampleBuffer,
                       outputSamples: &outputSamples,
                       samplesToProcess: samplesToProcess,
                       downSampledLength: downSampledLength,
                       samplesPerPixel: samplesPerPixel,
                       filter: filter)
        //print("Status: \(reader.status)")
    }
    
    // Process the remaining samples at the end which didn't fit into samplesPerPixel
    let samplesToProcess = sampleBuffer.count / MemoryLayout.size
    if samplesToProcess > 0 {
        let downSampledLength = 1
        let samplesPerPixel = samplesToProcess
        let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)
        
        processSamples(fromData: &sampleBuffer,
                       outputSamples: &outputSamples,
                       samplesToProcess: samplesToProcess,
                       downSampledLength: downSampledLength,
                       samplesPerPixel: samplesPerPixel,
                       filter: filter)
        //print("Status: \(reader.status)")
    }
    
    // if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
    guard reader.status == .completed else {
        fatalError("Couldn't read the audio file")
    }
    
    return outputSamples
}

E) render uses this function to down-sample the data from the audio file, and convert to decibels:

func processSamples(fromData sampleBuffer: inout Data,
                    outputSamples: inout [Float],
                    samplesToProcess: Int,
                    downSampledLength: Int,
                    samplesPerPixel: Int,
                    filter: [Float]) {
    sampleBuffer.withUnsafeBytes { (samples: UnsafePointer) in
        var processingBuffer = [Float](repeating: 0.0, count: samplesToProcess)
        
        let sampleCount = vDSP_Length(samplesToProcess)
        
        //Convert 16bit int samples to floats
        vDSP_vflt16(samples, 1, &processingBuffer, 1, sampleCount)
        
        //Take the absolute values to get amplitude
        vDSP_vabs(processingBuffer, 1, &processingBuffer, 1, sampleCount)
        
        //get the corresponding dB, and clip the results
        getdB(from: &processingBuffer)
        
        //Downsample and average
        var downSampledData = [Float](repeating: 0.0, count: downSampledLength)
        vDSP_desamp(processingBuffer,
                    vDSP_Stride(samplesPerPixel),
                    filter, &downSampledData,
                    vDSP_Length(downSampledLength),
                    vDSP_Length(samplesPerPixel))
        
        //Remove processed samples
        sampleBuffer.removeFirst(samplesToProcess * MemoryLayout.size)
        
        outputSamples += downSampledData
    }
}

F) Which in turn calls this function that gets the corresponding dB, and clips the results to [noiseFloor, 0]:

func getdB(from normalizedSamples: inout [Float]) {
    // Convert samples to a log scale
    var zero: Float = 32768.0
    vDSP_vdbcon(normalizedSamples, 1, &zero, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count), 1)
    
    //Clip to [noiseFloor, 0]
    var ceil: Float = 0.0
    var noiseFloorMutable = noiseFloor
    vDSP_vclip(normalizedSamples, 1, &noiseFloorMutable, &ceil, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count))
}

G) Finally you can get the waveform of the audio like so:

guard let path = Bundle.main.path(forResource: "audio", ofType:"mp3") else {
    fatalError("Couldn't find the file path")
}
let url = URL(fileURLWithPath: path)
var outputArray : [Float] = []
AudioContext.load(fromAudioURL: url, completionHandler: { audioContext in
    guard let audioContext = audioContext else {
        fatalError("Couldn't create the audioContext")
    }
    outputArray = self.render(audioContext: audioContext, targetSamples: 300)
})

Don't forget that AudioContext.load(fromAudioURL:) is asynchronous.

This solution is synthesized from this repo by William Entriken. All credit goes to him.

Swift 5

Here is the same code updated to Swift 5 syntax:

import AVFoundation
import Accelerate

/// Holds audio information used for building waveforms
final class AudioContext {
    
    /// The audio asset URL used to load the context
    public let audioURL: URL
    
    /// Total number of samples in loaded asset
    public let totalSamples: Int
    
    /// Loaded asset
    public let asset: AVAsset
    
    // Loaded assetTrack
    public let assetTrack: AVAssetTrack
    
    private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
        self.audioURL = audioURL
        self.totalSamples = totalSamples
        self.asset = asset
        self.assetTrack = assetTrack
    }
    
    public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
        let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
        
        guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
            fatalError("Couldn't load AVAssetTrack")
        }
        
        asset.loadValuesAsynchronously(forKeys: ["duration"]) {
            var error: NSError?
            let status = asset.statusOfValue(forKey: "duration", error: &error)
            switch status {
            case .loaded:
                guard
                    let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
                    let audioFormatDesc = formatDescriptions.first,
                    let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
                    else { break }
                
                let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
                let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
                completionHandler(audioContext)
                return
                
            case .failed, .cancelled, .loading, .unknown:
                print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
            }
            
            completionHandler(nil)
        }
    }
}

let noiseFloor: Float = -80

func render(audioContext: AudioContext?, targetSamples: Int = 100) -> [Float]{
    guard let audioContext = audioContext else {
        fatalError("Couldn't create the audioContext")
    }
    
    let sampleRange: CountableRange = 0..?
        CMBlockBufferGetDataPointer(readBuffer,
                                    atOffset: 0,
                                    lengthAtOffsetOut: &readBufferLength,
                                    totalLengthOut: nil,
                                    dataPointerOut: &readBufferPointer)
        sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
        CMSampleBufferInvalidate(readSampleBuffer)
        
        let totalSamples = sampleBuffer.count / MemoryLayout.size
        let downSampledLength = totalSamples / samplesPerPixel
        let samplesToProcess = downSampledLength * samplesPerPixel
        
        guard samplesToProcess > 0 else { continue }
        
        processSamples(fromData: &sampleBuffer,
                       outputSamples: &outputSamples,
                       samplesToProcess: samplesToProcess,
                       downSampledLength: downSampledLength,
                       samplesPerPixel: samplesPerPixel,
                       filter: filter)
        //print("Status: \(reader.status)")
    }
    
    // Process the remaining samples at the end which didn't fit into samplesPerPixel
    let samplesToProcess = sampleBuffer.count / MemoryLayout.size
    if samplesToProcess > 0 {
        let downSampledLength = 1
        let samplesPerPixel = samplesToProcess
        let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)
        
        processSamples(fromData: &sampleBuffer,
                       outputSamples: &outputSamples,
                       samplesToProcess: samplesToProcess,
                       downSampledLength: downSampledLength,
                       samplesPerPixel: samplesPerPixel,
                       filter: filter)
        //print("Status: \(reader.status)")
    }
    
    // if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
    guard reader.status == .completed else {
        fatalError("Couldn't read the audio file")
    }
    
    return outputSamples
}

func processSamples(fromData sampleBuffer: inout Data,
                    outputSamples: inout [Float],
                    samplesToProcess: Int,
                    downSampledLength: Int,
                    samplesPerPixel: Int,
                    filter: [Float]) {
    
    sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) in
        var processingBuffer = [Float](repeating: 0.0, count: samplesToProcess)
        
        let sampleCount = vDSP_Length(samplesToProcess)
        
        //Create an UnsafePointer from samples
        let unsafeBufferPointer = samples.bindMemory(to: Int16.self)
        let unsafePointer = unsafeBufferPointer.baseAddress!
        
        //Convert 16bit int samples to floats
        vDSP_vflt16(unsafePointer, 1, &processingBuffer, 1, sampleCount)
        
        //Take the absolute values to get amplitude
        vDSP_vabs(processingBuffer, 1, &processingBuffer, 1, sampleCount)
        
        //get the corresponding dB, and clip the results
        getdB(from: &processingBuffer)
        
        //Downsample and average
        var downSampledData = [Float](repeating: 0.0, count: downSampledLength)
        vDSP_desamp(processingBuffer,
                    vDSP_Stride(samplesPerPixel),
                    filter, &downSampledData,
                    vDSP_Length(downSampledLength),
                    vDSP_Length(samplesPerPixel))
        
        //Remove processed samples
        sampleBuffer.removeFirst(samplesToProcess * MemoryLayout.size)
        
        outputSamples += downSampledData
    }
}

func getdB(from normalizedSamples: inout [Float]) {
    // Convert samples to a log scale
    var zero: Float = 32768.0
    vDSP_vdbcon(normalizedSamples, 1, &zero, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count), 1)
    
    //Clip to [noiseFloor, 0]
    var ceil: Float = 0.0
    var noiseFloorMutable = noiseFloor
    vDSP_vclip(normalizedSamples, 1, &noiseFloorMutable, &ceil, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count))
}

Old solution

Here is a function you could use to pre-render the meter levels of an audio file without playing it:

func averagePowers(audioFileURL: URL, forChannel channelNumber: Int, completionHandler: @escaping(_ success: [Float]) -> ()) {
    let audioFile = try! AVAudioFile(forReading: audioFileURL)
    let audioFilePFormat = audioFile.processingFormat
    let audioFileLength = audioFile.length
    
    //Set the size of frames to read from the audio file, you can adjust this to your liking
    let frameSizeToRead = Int(audioFilePFormat.sampleRate/20)
    
    //This is to how many frames/portions we're going to divide the audio file
    let numberOfFrames = Int(audioFileLength)/frameSizeToRead
    
    //Create a pcm buffer the size of a frame
    guard let audioBuffer = AVAudioPCMBuffer(pcmFormat: audioFilePFormat, frameCapacity: AVAudioFrameCount(frameSizeToRead)) else {
        fatalError("Couldn't create the audio buffer")
    }
    
    //Do the calculations in a background thread, if you don't want to block the main thread for larger audio files
    DispatchQueue.global(qos: .userInitiated).async {
        
        //This is the array to be returned
        var returnArray : [Float] = [Float]()
        
        //We're going to read the audio file, frame by frame
        for i in 0.. 0.000_000_01 ? 20 * log10(meanValue) : -160.0
            
            //append the db power in the current frame to the returnArray
            returnArray.append(dbPower)
        }
        
        //Return the dBPowers
        completionHandler(returnArray)
    }
}

And you can call it like so:

let path = Bundle.main.path(forResource: "audio.mp3", ofType:nil)!
let url = URL(fileURLWithPath: path)
averagePowers(audioFileURL: url, forChannel: 0, completionHandler: { array in
    //Use the array
})

Using instruments, this solution makes high cpu usage during 1.2 seconds, takes about 5 seconds to return to the main thread with the returnArray, and up to 10 seconds when on low battery mode.

0 讨论(0)

查看其它2个回答