Extract meter levels from audio file

醉梦人生 2021-01-31 19:22

I need to extract audio meter levels from a file so I can render the levels before playing the audio. I know AVAudioPlayer can get this information while playing the audio file, but here I need the levels before playback starts.

2 Answers
    温柔的废话 · 2021-01-31 20:07

    Swift 4

    On an iPhone, it takes:

    • 0.538 s to process an 8 MB mp3 file with a 4 min 47 s duration and a 44,100 Hz sample rate

    • 0.170 s to process a 712 KB mp3 file with a 22 s duration and a 44,100 Hz sample rate

    • 0.089 s to process the caf file created by converting the file above with the command afconvert -f caff -d LEI16 audio.mp3 audio.caf in Terminal.

    Let's begin:

    A) Declare this class that is going to hold the necessary information about the audio asset:

    /// Holds audio information used for building waveforms
    final class AudioContext {
        
        /// The audio asset URL used to load the context
        public let audioURL: URL
        
        /// Total number of samples in loaded asset
        public let totalSamples: Int
        
        /// Loaded asset
        public let asset: AVAsset
        
        /// Loaded assetTrack
        public let assetTrack: AVAssetTrack
        
        private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
            self.audioURL = audioURL
            self.totalSamples = totalSamples
            self.asset = asset
            self.assetTrack = assetTrack
        }
        
        public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
            let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
            
            guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
                fatalError("Couldn't load AVAssetTrack")
            }
            
            asset.loadValuesAsynchronously(forKeys: ["duration"]) {
                var error: NSError?
                let status = asset.statusOfValue(forKey: "duration", error: &error)
                switch status {
                case .loaded:
                    guard
                        let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
                        let audioFormatDesc = formatDescriptions.first,
                        let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
                        else { break }
                    
                    let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
                    let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
                    completionHandler(audioContext)
                    return
                    
                case .failed, .cancelled, .loading, .unknown:
                    print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
                }
                
                completionHandler(nil)
            }
        }
    }
    

    We are going to use its asynchronous load function and receive its result in a completion handler.

    B) Import AVFoundation and Accelerate in your view controller:

    import AVFoundation
    import Accelerate
    

    C) Declare the noise level in your view controller (in dB):

    let noiseFloor: Float = -80
    

    For example, anything below -80 dB will be treated as silence.

    D) The following function takes an audio context and produces the desired dB powers. targetSamples defaults to 100; change it to suit your UI needs:

    func render(audioContext: AudioContext?, targetSamples: Int = 100) -> [Float]{
        guard let audioContext = audioContext else {
            fatalError("Couldn't create the audioContext")
        }
        
        let sampleRange: CountableRange<Int> = 0..<audioContext.totalSamples
        
        guard let reader = try? AVAssetReader(asset: audioContext.asset) else {
            fatalError("Couldn't initialize the AVAssetReader")
        }
        
        reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
                                       duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))
        
        let outputSettingsDict: [String : Any] = [
            AVFormatIDKey: Int(kAudioFormatLinearPCM),
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsBigEndianKey: false,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsNonInterleaved: false
        ]
        
        let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack,
                                                    outputSettings: outputSettingsDict)
        readerOutput.alwaysCopiesSampleData = false
        reader.add(readerOutput)
        
        var channelCount = 1
        let formatDescriptions = audioContext.assetTrack.formatDescriptions as! [CMAudioFormatDescription]
        for item in formatDescriptions {
            guard let fmtDesc = CMAudioFormatDescriptionGetStreamBasicDescription(item) else {
                fatalError("Couldn't get the format description")
            }
            channelCount = Int(fmtDesc.pointee.mChannelsPerFrame)
        }
        
        let samplesPerPixel = max(1, channelCount * sampleRange.count / targetSamples)
        let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)
        
        var outputSamples = [Float]()
        var sampleBuffer = Data()
        
        // 16-bit samples
        reader.startReading()
        defer { reader.cancelReading() } // Cancel reading if we exit early
        
        while reader.status == .reading {
            guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
                let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
                    break
            }
            // Append the sample buffer's bytes to our running Data buffer
            var readBufferLength = 0
            var readBufferPointer: UnsafeMutablePointer<Int8>?
            CMBlockBufferGetDataPointer(readBuffer, 0, &readBufferLength, nil, &readBufferPointer)
            sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
            CMSampleBufferInvalidate(readSampleBuffer)
            
            let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
            let downSampledLength = totalSamples / samplesPerPixel
            let samplesToProcess = downSampledLength * samplesPerPixel
            
            guard samplesToProcess > 0 else { continue }
            
            processSamples(fromData: &sampleBuffer,
                           outputSamples: &outputSamples,
                           samplesToProcess: samplesToProcess,
                           downSampledLength: downSampledLength,
                           samplesPerPixel: samplesPerPixel,
                           filter: filter)
            //print("Status: \(reader.status)")
        }
        
        // Process the remaining samples at the end which didn't fit into samplesPerPixel
        let samplesToProcess = sampleBuffer.count / MemoryLayout<Int16>.size
        if samplesToProcess > 0 {
            let downSampledLength = 1
            let samplesPerPixel = samplesToProcess
            let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)
            
            processSamples(fromData: &sampleBuffer,
                           outputSamples: &outputSamples,
                           samplesToProcess: samplesToProcess,
                           downSampledLength: downSampledLength,
                           samplesPerPixel: samplesPerPixel,
                           filter: filter)
            //print("Status: \(reader.status)")
        }
        
        // if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
        guard reader.status == .completed else {
            fatalError("Couldn't read the audio file")
        }
        
        return outputSamples
    }
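
    To get a feel for the amount of down-sampling, here is an illustrative back-of-the-envelope calculation (the 22 s / 44,100 Hz figures come from the timings above; the exact numbers depend on your file and its channel count):

    //Rough illustration only, not part of the original code; assumes a mono file
    let sampleCount = 22 * 44_100                                // ≈ 970,200 samples in a 22 s mono file
    let targetSamples = 100
    let samplesPerPixel = max(1, sampleCount / targetSamples)    // ≈ 9,702
    //Each of the 100 output dB values is the average of roughly 9,702 source samples.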
    

    E) render uses this function to down-sample the data from the audio file and convert it to decibels:

    func processSamples(fromData sampleBuffer: inout Data,
                        outputSamples: inout [Float],
                        samplesToProcess: Int,
                        downSampledLength: Int,
                        samplesPerPixel: Int,
                        filter: [Float]) {
        sampleBuffer.withUnsafeBytes { (samples: UnsafePointer<Int16>) in
            var processingBuffer = [Float](repeating: 0.0, count: samplesToProcess)
            
            let sampleCount = vDSP_Length(samplesToProcess)
            
            //Convert 16bit int samples to floats
            vDSP_vflt16(samples, 1, &processingBuffer, 1, sampleCount)
            
            //Take the absolute values to get amplitude
            vDSP_vabs(processingBuffer, 1, &processingBuffer, 1, sampleCount)
            
            //get the corresponding dB, and clip the results
            getdB(from: &processingBuffer)
            
            //Downsample and average
            var downSampledData = [Float](repeating: 0.0, count: downSampledLength)
            vDSP_desamp(processingBuffer,
                        vDSP_Stride(samplesPerPixel),
                        filter, &downSampledData,
                        vDSP_Length(downSampledLength),
                        vDSP_Length(samplesPerPixel))
            
            //Remove processed samples
            sampleBuffer.removeFirst(samplesToProcess * MemoryLayout<Int16>.size)
            
            outputSamples += downSampledData
        }
    }
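
    A note on vDSP_desamp: because every tap of filter is 1/samplesPerPixel, the call above simply averages each consecutive group of samplesPerPixel values. A plain-Swift sketch of the same idea, for illustration only (the answer itself uses the vectorized Accelerate call):

    //Illustrative scalar equivalent of vDSP_desamp with an averaging filter
    func downsampleByAveraging(_ input: [Float], samplesPerPixel: Int) -> [Float] {
        let downSampledLength = input.count / samplesPerPixel
        return (0..<downSampledLength).map { i in
            //Average one group of samplesPerPixel consecutive values
            let chunk = input[(i * samplesPerPixel)..<((i + 1) * samplesPerPixel)]
            return chunk.reduce(0, +) / Float(samplesPerPixel)
        }
    }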
    

    F) processSamples in turn calls this function, which converts the amplitudes to dB and clips the results to [noiseFloor, 0]:

    func getdB(from normalizedSamples: inout [Float]) {
        // Convert samples to a log scale
        var zero: Float = 32768.0
        vDSP_vdbcon(normalizedSamples, 1, &zero, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count), 1)
        
        //Clip to [noiseFloor, 0]
        var ceil: Float = 0.0
        var noiseFloorMutable = noiseFloor
        vDSP_vclip(normalizedSamples, 1, &noiseFloorMutable, &ceil, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count))
    }
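
    vDSP_vdbcon with a reference value of 32768 (full scale for 16-bit samples) and the amplitude flag computes 20·log10(amplitude / 32768), so a full-scale sample maps to 0 dB; vDSP_vclip then clamps the result. The per-sample equivalent, shown only for clarity:

    //Illustrative per-sample equivalent of vDSP_vdbcon + vDSP_vclip
    func decibel(fromAmplitude amplitude: Float, noiseFloor: Float = -80) -> Float {
        let dB = 20 * log10(amplitude / 32768)   // 0 dB at full scale
        return min(max(dB, noiseFloor), 0)       // clip to [noiseFloor, 0]
    }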
    

    G) Finally you can get the waveform of the audio like so:

    guard let path = Bundle.main.path(forResource: "audio", ofType:"mp3") else {
        fatalError("Couldn't find the file path")
    }
    let url = URL(fileURLWithPath: path)
    var outputArray : [Float] = []
    AudioContext.load(fromAudioURL: url, completionHandler: { audioContext in
        guard let audioContext = audioContext else {
            fatalError("Couldn't create the audioContext")
        }
        outputArray = self.render(audioContext: audioContext, targetSamples: 300)
    })
    

    Don't forget that AudioContext.load(fromAudioURL:) is asynchronous.
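
    The completion handler may be called on a background queue, so if you want to use the result to update your UI, hop back to the main queue first. A minimal sketch (updateWaveform(with:) is a hypothetical method of your own view controller):

    AudioContext.load(fromAudioURL: url) { audioContext in
        guard let audioContext = audioContext else { return }
        let samples = self.render(audioContext: audioContext, targetSamples: 300)
        DispatchQueue.main.async {
            //Hypothetical UI update with the rendered dB values
            self.updateWaveform(with: samples)
        }
    }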

    This solution is synthesized from this repo by William Entriken. All credit goes to him.


    Swift 5

    Here is the same code updated to Swift 5 syntax:

    import AVFoundation
    import Accelerate
    
    /// Holds audio information used for building waveforms
    final class AudioContext {
        
        /// The audio asset URL used to load the context
        public let audioURL: URL
        
        /// Total number of samples in loaded asset
        public let totalSamples: Int
        
        /// Loaded asset
        public let asset: AVAsset
        
        /// Loaded assetTrack
        public let assetTrack: AVAssetTrack
        
        private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
            self.audioURL = audioURL
            self.totalSamples = totalSamples
            self.asset = asset
            self.assetTrack = assetTrack
        }
        
        public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
            let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
            
            guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
                fatalError("Couldn't load AVAssetTrack")
            }
            
            asset.loadValuesAsynchronously(forKeys: ["duration"]) {
                var error: NSError?
                let status = asset.statusOfValue(forKey: "duration", error: &error)
                switch status {
                case .loaded:
                    guard
                        let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
                        let audioFormatDesc = formatDescriptions.first,
                        let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
                        else { break }
                    
                    let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
                    let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
                    completionHandler(audioContext)
                    return
                    
                case .failed, .cancelled, .loading, .unknown:
                    print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
                }
                
                completionHandler(nil)
            }
        }
    }
    
    let noiseFloor: Float = -80
    
    func render(audioContext: AudioContext?, targetSamples: Int = 100) -> [Float]{
        guard let audioContext = audioContext else {
            fatalError("Couldn't create the audioContext")
        }
        
        let sampleRange: CountableRange<Int> = 0..<audioContext.totalSamples
        
        guard let reader = try? AVAssetReader(asset: audioContext.asset) else {
            fatalError("Couldn't initialize the AVAssetReader")
        }
        
        reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
                                       duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))
        
        let outputSettingsDict: [String : Any] = [
            AVFormatIDKey: Int(kAudioFormatLinearPCM),
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsBigEndianKey: false,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsNonInterleaved: false
        ]
        
        let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack,
                                                    outputSettings: outputSettingsDict)
        readerOutput.alwaysCopiesSampleData = false
        reader.add(readerOutput)
        
        var channelCount = 1
        let formatDescriptions = audioContext.assetTrack.formatDescriptions as! [CMAudioFormatDescription]
        for item in formatDescriptions {
            guard let fmtDesc = CMAudioFormatDescriptionGetStreamBasicDescription(item) else {
                fatalError("Couldn't get the format description")
            }
            channelCount = Int(fmtDesc.pointee.mChannelsPerFrame)
        }
        
        let samplesPerPixel = max(1, channelCount * sampleRange.count / targetSamples)
        let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)
        
        var outputSamples = [Float]()
        var sampleBuffer = Data()
        
        // 16-bit samples
        reader.startReading()
        defer { reader.cancelReading() } // Cancel reading if we exit early
        
        while reader.status == .reading {
            guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
                let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
                    break
            }
            // Append the sample buffer's bytes to our running Data buffer
            var readBufferLength = 0
            var readBufferPointer: UnsafeMutablePointer<Int8>?
            CMBlockBufferGetDataPointer(readBuffer,
                                        atOffset: 0,
                                        lengthAtOffsetOut: &readBufferLength,
                                        totalLengthOut: nil,
                                        dataPointerOut: &readBufferPointer)
            sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
            CMSampleBufferInvalidate(readSampleBuffer)
            
            let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
            let downSampledLength = totalSamples / samplesPerPixel
            let samplesToProcess = downSampledLength * samplesPerPixel
            
            guard samplesToProcess > 0 else { continue }
            
            processSamples(fromData: &sampleBuffer,
                           outputSamples: &outputSamples,
                           samplesToProcess: samplesToProcess,
                           downSampledLength: downSampledLength,
                           samplesPerPixel: samplesPerPixel,
                           filter: filter)
            //print("Status: \(reader.status)")
        }
        
        // Process the remaining samples at the end which didn't fit into samplesPerPixel
        let samplesToProcess = sampleBuffer.count / MemoryLayout<Int16>.size
        if samplesToProcess > 0 {
            let downSampledLength = 1
            let samplesPerPixel = samplesToProcess
            let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)
            
            processSamples(fromData: &sampleBuffer,
                           outputSamples: &outputSamples,
                           samplesToProcess: samplesToProcess,
                           downSampledLength: downSampledLength,
                           samplesPerPixel: samplesPerPixel,
                           filter: filter)
            //print("Status: \(reader.status)")
        }
        
        // if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
        guard reader.status == .completed else {
            fatalError("Couldn't read the audio file")
        }
        
        return outputSamples
    }
    
    func processSamples(fromData sampleBuffer: inout Data,
                        outputSamples: inout [Float],
                        samplesToProcess: Int,
                        downSampledLength: Int,
                        samplesPerPixel: Int,
                        filter: [Float]) {
        
        sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) in
            var processingBuffer = [Float](repeating: 0.0, count: samplesToProcess)
            
            let sampleCount = vDSP_Length(samplesToProcess)
            
            //Create an UnsafePointer from samples
            let unsafeBufferPointer = samples.bindMemory(to: Int16.self)
            let unsafePointer = unsafeBufferPointer.baseAddress!
            
            //Convert 16bit int samples to floats
            vDSP_vflt16(unsafePointer, 1, &processingBuffer, 1, sampleCount)
            
            //Take the absolute values to get amplitude
            vDSP_vabs(processingBuffer, 1, &processingBuffer, 1, sampleCount)
            
            //get the corresponding dB, and clip the results
            getdB(from: &processingBuffer)
            
            //Downsample and average
            var downSampledData = [Float](repeating: 0.0, count: downSampledLength)
            vDSP_desamp(processingBuffer,
                        vDSP_Stride(samplesPerPixel),
                        filter, &downSampledData,
                        vDSP_Length(downSampledLength),
                        vDSP_Length(samplesPerPixel))
            
            //Remove processed samples
            sampleBuffer.removeFirst(samplesToProcess * MemoryLayout<Int16>.size)
            
            outputSamples += downSampledData
        }
    }
    
    func getdB(from normalizedSamples: inout [Float]) {
        // Convert samples to a log scale
        var zero: Float = 32768.0
        vDSP_vdbcon(normalizedSamples, 1, &zero, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count), 1)
        
        //Clip to [noiseFloor, 0]
        var ceil: Float = 0.0
        var noiseFloorMutable = noiseFloor
        vDSP_vclip(normalizedSamples, 1, &noiseFloorMutable, &ceil, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count))
    }
    

    Old solution

    Here is a function you could use to pre-render the meter levels of an audio file without playing it:

    func averagePowers(audioFileURL: URL, forChannel channelNumber: Int, completionHandler: @escaping(_ success: [Float]) -> ()) {
        let audioFile = try! AVAudioFile(forReading: audioFileURL)
        let audioFilePFormat = audioFile.processingFormat
        let audioFileLength = audioFile.length
        
        //Set the size of frames to read from the audio file, you can adjust this to your liking
        let frameSizeToRead = Int(audioFilePFormat.sampleRate/20)
        
        //This is how many frames/portions we're going to divide the audio file into
        let numberOfFrames = Int(audioFileLength)/frameSizeToRead
        
        //Create a pcm buffer the size of a frame
        guard let audioBuffer = AVAudioPCMBuffer(pcmFormat: audioFilePFormat, frameCapacity: AVAudioFrameCount(frameSizeToRead)) else {
            fatalError("Couldn't create the audio buffer")
        }
        
        //Do the calculations in a background thread, if you don't want to block the main thread for larger audio files
        DispatchQueue.global(qos: .userInitiated).async {
            
            //This is the array to be returned
            var returnArray : [Float] = [Float]()
            
            //We're going to read the audio file, frame by frame
            for i in 0..<numberOfFrames {
                
                //Change the position from which we're reading the audio file, since each frame starts at a different position
                audioFile.framePosition = AVAudioFramePosition(i * frameSizeToRead)
                
                //Read a frame from the audio file into the buffer
                try! audioFile.read(into: audioBuffer, frameCount: AVAudioFrameCount(frameSizeToRead))
                
                //Get the data of the chosen channel
                let channelData = audioBuffer.floatChannelData![channelNumber]
                
                //Copy the channel data into a [Float]
                let arr = Array(UnsafeBufferPointer(start: channelData, count: frameSizeToRead))
                
                //Calculate the mean of the absolute values (average amplitude of the frame)
                let meanValue = arr.reduce(0, { $0 + abs($1) }) / Float(arr.count)
                
                //Convert to dB power, treating anything below 0.000_000_01 as -160 dB
                let dbPower: Float = meanValue > 0.000_000_01 ? 20 * log10(meanValue) : -160.0
                
                //append the db power in the current frame to the returnArray
                returnArray.append(dbPower)
            }
            
            //Return the dBPowers
            completionHandler(returnArray)
        }
    }
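
    To put the frame size in perspective (using the 44,100 Hz figure from the timings above; illustration only): sampleRate/20 means each frame covers 1/20 of a second, i.e. 50 ms of audio.

    //Rough illustration only, not part of the original code
    let sampleRate = 44_100.0
    let frameSizeToRead = Int(sampleRate / 20)                       // 2,205 samples ≈ 50 ms per frame
    let numberOfFrames = Int(22.0 * sampleRate) / frameSizeToRead    // a 22 s file yields ≈ 440 dB values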
    

    And you can call it like so:

    let path = Bundle.main.path(forResource: "audio.mp3", ofType:nil)!
    let url = URL(fileURLWithPath: path)
    averagePowers(audioFileURL: url, forChannel: 0, completionHandler: { array in
        //Use the array
    })
    

    Profiled with Instruments, this solution shows high CPU usage for about 1.2 seconds, takes about 5 seconds to return the returnArray to the main thread, and up to 10 seconds in Low Power Mode.
