Extract meter levels from audio file

问题

I need to extract audio meter levels from a file so I can render the levels before playing the audio. I know AVAudioPlayer can get this information while playing the audio file through

func averagePower(forChannel channelNumber: Int) -> Float.

But in my case I would like to obtain an [Float] of meter levels beforehand.

回答1:

Swift 4

Measured processing times on an iPhone:

0.538s to process an 8 MB MP3 file with a duration of 4min47s and a 44,100 Hz sampling rate
0.170s to process a 712 KB MP3 file with a duration of 22s and a 44,100 Hz sampling rate
0.089s to process a CAF file created by converting the file above with this terminal command: afconvert -f caff -d LEI16 audio.mp3 audio.caf

Let's begin:

A) Declare this class that is going to hold the necessary information about the audio asset:

/// Holds audio information used for building waveforms
final class AudioContext {

    /// The audio asset URL used to load the context
    public let audioURL: URL

    /// Total number of samples in loaded asset
    public let totalSamples: Int

    /// Loaded asset
    public let asset: AVAsset

    /// Loaded assetTrack
    public let assetTrack: AVAssetTrack

    private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
        self.audioURL = audioURL
        self.totalSamples = totalSamples
        self.asset = asset
        self.assetTrack = assetTrack
    }

    /// Asynchronously loads the asset at `audioURL` and builds an `AudioContext` from its first audio track.
    ///
    /// `totalSamples` is computed as the track's sample rate multiplied by the asset duration in seconds.
    ///
    /// - Parameters:
    ///   - audioURL: Location of the audio file to inspect.
    ///   - completionHandler: Called exactly once, from the `loadValuesAsynchronously` completion block,
    ///     with the loaded context, or with `nil` when a key fails to load, the asset has no audio track,
    ///     or the track's stream description or duration is unusable.
    public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
        let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])

        // "tracks" is loaded together with "duration" so the track lookup below happens on loaded values.
        let requiredKeys = ["duration", "tracks"]

        asset.loadValuesAsynchronously(forKeys: requiredKeys) {
            for key in requiredKeys {
                var error: NSError?
                guard asset.statusOfValue(forKey: key, error: &error) == .loaded else {
                    print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
                    completionHandler(nil)
                    return
                }
            }

            // A missing audio track is reported through the handler instead of crashing the process.
            guard
                let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first,
                let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
                let audioFormatDesc = formatDescriptions.first,
                let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc),
                asset.duration.timescale != 0
                else {
                    print("Couldn't load AVAssetTrack")
                    completionHandler(nil)
                    return
            }

            let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
            let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
            completionHandler(audioContext)
        }
    }
}

We are going to use its asynchronous function load, and hand its result to a completion handler.

B) Import AVFoundation and Accelerate in your view controller:

import AVFoundation
import Accelerate

C) Declare the noise floor in your view controller (in dB):

/// Noise floor in dB; `getdB(from:)` uses it as the lower bound when clipping levels.
let noiseFloor: Float = -80

For example, anything less than -80dB will be considered as silence.

D) The following function takes an audio context and produces the desired dB powers. targetSamples is by default set to 100, you can change that to suit your UI needs:

/// Renders the audio described by `audioContext` into an array of dB levels.
///
/// The track is decoded to interleaved 16-bit little-endian integer PCM, and the samples are handed to
/// `processSamples`, which converts them to dB and averages them in windows of `samplesPerPixel`.
///
/// - Parameters:
///   - audioContext: Context produced by `AudioContext.load`. `nil` yields an empty result.
///   - targetSamples: Number of output values to aim for. Must be positive; otherwise an empty
///     array is returned.
/// - Returns: The averaged dB levels. Empty when the reader cannot be set up; whatever was decoded
///   so far when reading fails part-way.
func render(audioContext: AudioContext?, targetSamples: Int = 100) -> [Float]{
    guard let audioContext = audioContext else {
        print("Couldn't create the audioContext")
        return []
    }

    // A non-positive target would divide by zero when computing samplesPerPixel.
    guard targetSamples > 0 else {
        print("targetSamples must be greater than zero")
        return []
    }

    // NOTE(review): only the first third of `totalSamples` is requested, and the sample indices are
    // expressed in the asset duration's timescale rather than the sample rate. Kept as in the
    // original; confirm this is intended.
    let sampleRange: CountableRange<Int> = 0..<max(0, audioContext.totalSamples/3)

    let reader: AVAssetReader
    do {
        reader = try AVAssetReader(asset: audioContext.asset)
    } catch {
        print("Couldn't initialize the AVAssetReader: \(error.localizedDescription)")
        return []
    }

    reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
                                   duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))

    let outputSettingsDict: [String : Any] = [
        AVFormatIDKey: Int(kAudioFormatLinearPCM),
        AVLinearPCMBitDepthKey: 16,
        AVLinearPCMIsBigEndianKey: false,
        AVLinearPCMIsFloatKey: false,
        AVLinearPCMIsNonInterleaved: false
    ]

    let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack,
                                                outputSettings: outputSettingsDict)
    readerOutput.alwaysCopiesSampleData = false
    guard reader.canAdd(readerOutput) else {
        print("Couldn't add the track output to the AVAssetReader")
        return []
    }
    reader.add(readerOutput)

    // The channel count of the last format description is used, as in the original loop.
    guard let formatDescriptions = audioContext.assetTrack.formatDescriptions as? [CMAudioFormatDescription] else {
        print("Couldn't get the format description")
        return []
    }
    var channelCount = 1
    for item in formatDescriptions {
        guard let fmtDesc = CMAudioFormatDescriptionGetStreamBasicDescription(item) else {
            print("Couldn't get the format description")
            return []
        }
        channelCount = Int(fmtDesc.pointee.mChannelsPerFrame)
    }

    let samplesPerPixel = max(1, channelCount * sampleRange.count / targetSamples)
    let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)

    var outputSamples = [Float]()
    var sampleBuffer = Data()

    // 16-bit samples
    guard reader.startReading() else {
        print("Couldn't read the audio file: \(reader.error?.localizedDescription ?? "Unknown error")")
        return []
    }
    defer { reader.cancelReading() }

    while reader.status == .reading {
        guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
            let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
                break
        }
        // Append audio sample buffer into our current sample buffer.
        // NOTE(review): this copies only the bytes reported at offset 0; presumably the block
        // buffer is contiguous. Verify if truncated output is ever observed.
        var readBufferLength = 0
        var readBufferPointer: UnsafeMutablePointer<Int8>?
        let pointerStatus = CMBlockBufferGetDataPointer(readBuffer, 0, &readBufferLength, nil, &readBufferPointer)
        if pointerStatus == noErr, let pointer = readBufferPointer, readBufferLength > 0 {
            sampleBuffer.append(UnsafeBufferPointer(start: UnsafePointer(pointer), count: readBufferLength))
        }
        CMSampleBufferInvalidate(readSampleBuffer)

        let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
        let downSampledLength = totalSamples / samplesPerPixel
        let samplesToProcess = downSampledLength * samplesPerPixel

        // Not enough data for a full window yet; keep accumulating.
        guard samplesToProcess > 0 else { continue }

        processSamples(fromData: &sampleBuffer,
                       outputSamples: &outputSamples,
                       samplesToProcess: samplesToProcess,
                       downSampledLength: downSampledLength,
                       samplesPerPixel: samplesPerPixel,
                       filter: filter)
    }

    // Process the remaining samples at the end which didn't fit into samplesPerPixel
    let samplesToProcess = sampleBuffer.count / MemoryLayout<Int16>.size
    if samplesToProcess > 0 {
        let downSampledLength = 1
        let samplesPerPixel = samplesToProcess
        let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)

        processSamples(fromData: &sampleBuffer,
                       outputSamples: &outputSamples,
                       samplesToProcess: samplesToProcess,
                       downSampledLength: downSampledLength,
                       samplesPerPixel: samplesPerPixel,
                       filter: filter)
    }

    // The original guard ended in `|| true` and could never fail. A failed read is reported here,
    // and the levels decoded so far are still returned.
    if reader.status == .failed || reader.status == .unknown {
        print("Couldn't read the audio file: \(reader.error?.localizedDescription ?? "Unknown error")")
    }

    return outputSamples
}

E) render uses this function to down-sample the data from the audio file, and convert to decibels:

/// Converts the first `samplesToProcess` 16-bit samples of `sampleBuffer` to dB, averages them in
/// windows of `samplesPerPixel`, appends the averages to `outputSamples`, and removes the consumed
/// bytes from `sampleBuffer`.
///
/// - Parameters:
///   - sampleBuffer: Raw 16-bit integer samples; the processed prefix is removed on return.
///   - outputSamples: Receives `downSampledLength` new values.
///   - samplesToProcess: Number of Int16 samples to consume from the front of `sampleBuffer`.
///   - downSampledLength: Number of averaged values to produce.
///   - samplesPerPixel: Window length (decimation factor) for each averaged value.
///   - filter: Averaging weights passed to `vDSP_desamp`; needs at least `samplesPerPixel` elements.
///
/// The call does nothing when the arguments are inconsistent with each other or with the amount of
/// data available, because `vDSP_desamp` would otherwise read past the end of its inputs.
func processSamples(fromData sampleBuffer: inout Data,
                    outputSamples: inout [Float],
                    samplesToProcess: Int,
                    downSampledLength: Int,
                    samplesPerPixel: Int,
                    filter: [Float]) {
    let bytesToProcess = samplesToProcess * MemoryLayout<Int16>.size
    guard samplesToProcess > 0,
        downSampledLength > 0,
        samplesPerPixel > 0,
        filter.count >= samplesPerPixel,
        downSampledLength * samplesPerPixel <= samplesToProcess,
        bytesToProcess <= sampleBuffer.count
        else { return }

    var downSampledData = [Float](repeating: 0.0, count: downSampledLength)

    sampleBuffer.withUnsafeBytes { (samples: UnsafePointer<Int16>) in
        var processingBuffer = [Float](repeating: 0.0, count: samplesToProcess)

        let sampleCount = vDSP_Length(samplesToProcess)

        //Convert 16bit int samples to floats
        vDSP_vflt16(samples, 1, &processingBuffer, 1, sampleCount)

        //Take the absolute values to get amplitude
        vDSP_vabs(processingBuffer, 1, &processingBuffer, 1, sampleCount)

        //get the corresponding dB, and clip the results
        getdB(from: &processingBuffer)

        //Downsample and average
        vDSP_desamp(processingBuffer,
                    vDSP_Stride(samplesPerPixel),
                    filter, &downSampledData,
                    vDSP_Length(downSampledLength),
                    vDSP_Length(samplesPerPixel))
    }

    // The buffer is mutated only after the unsafe pointer into it has gone out of scope.
    sampleBuffer.removeFirst(bytesToProcess)

    outputSamples += downSampledData
}

F) Which in turn calls this function that gets the corresponding dB, and clips the results to [noiseFloor, 0]:

/// Converts the amplitudes in `normalizedSamples` to decibels in place and clips every value to
/// the range `noiseFloor...0`.
func getdB(from normalizedSamples: inout [Float]) {
    let elementCount = vDSP_Length(normalizedSamples.count)

    // Decibel conversion against a reference of 32768 (the magnitude of Int16.min).
    var reference: Float = 32768.0
    let conversionFlag: UInt32 = 1
    vDSP_vdbcon(normalizedSamples, 1, &reference, &normalizedSamples, 1, elementCount, conversionFlag)

    // Clamp: nothing below the noise floor, nothing above 0 dB.
    var lowerBound = noiseFloor
    var upperBound: Float = 0.0
    vDSP_vclip(normalizedSamples, 1, &lowerBound, &upperBound, &normalizedSamples, 1, elementCount)
}

G) Finally you can get the waveform of the audio like so:

// Locate the bundled audio file; the example cannot continue without it.
guard let path = Bundle.main.path(forResource: "audio", ofType:"mp3") else {
    fatalError("Couldn't find the file path")
}
let url = URL(fileURLWithPath: path)
var outputArray : [Float] = []

// The closure runs once loading has finished; the levels are stored in outputArray there.
AudioContext.load(fromAudioURL: url) { loadedContext in
    guard let context = loadedContext else {
        fatalError("Couldn't create the audioContext")
    }
    outputArray = self.render(audioContext: context, targetSamples: 300)
}

Don't forget that AudioContext.load(fromAudioURL:) is asynchronous.

This solution is synthesized from this repo by William Entriken. All credit goes to him.

Swift 5

Here is the same code updated to Swift 5 syntax:

import AVFoundation
import Accelerate

/// Holds audio information used for building waveforms
final class AudioContext {

    /// The audio asset URL used to load the context
    public let audioURL: URL

    /// Total number of samples in loaded asset
    public let totalSamples: Int

    /// Loaded asset
    public let asset: AVAsset

    /// Loaded assetTrack
    public let assetTrack: AVAssetTrack

    private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
        self.audioURL = audioURL
        self.totalSamples = totalSamples
        self.asset = asset
        self.assetTrack = assetTrack
    }

    /// Asynchronously loads the asset at `audioURL` and builds an `AudioContext` from its first audio track.
    ///
    /// `totalSamples` is computed as the track's sample rate multiplied by the asset duration in seconds.
    ///
    /// - Parameters:
    ///   - audioURL: Location of the audio file to inspect.
    ///   - completionHandler: Called exactly once, from the `loadValuesAsynchronously` completion block,
    ///     with the loaded context, or with `nil` when a key fails to load, the asset has no audio track,
    ///     or the track's stream description or duration is unusable.
    public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
        let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])

        // "tracks" is loaded together with "duration" so the track lookup below happens on loaded values.
        let requiredKeys = ["duration", "tracks"]

        asset.loadValuesAsynchronously(forKeys: requiredKeys) {
            for key in requiredKeys {
                var error: NSError?
                guard asset.statusOfValue(forKey: key, error: &error) == .loaded else {
                    print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
                    completionHandler(nil)
                    return
                }
            }

            // A missing audio track is reported through the handler instead of crashing the process.
            guard
                let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first,
                let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
                let audioFormatDesc = formatDescriptions.first,
                let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc),
                asset.duration.timescale != 0
                else {
                    print("Couldn't load AVAssetTrack")
                    completionHandler(nil)
                    return
            }

            let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
            let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
            completionHandler(audioContext)
        }
    }
}

/// Noise floor in dB; `getdB(from:)` uses it as the lower bound when clipping levels.
let noiseFloor: Float = -80

/// Renders the audio described by `audioContext` into an array of dB levels.
///
/// The track is decoded to interleaved 16-bit little-endian integer PCM, and the samples are handed to
/// `processSamples`, which converts them to dB and averages them in windows of `samplesPerPixel`.
///
/// - Parameters:
///   - audioContext: Context produced by `AudioContext.load`. `nil` yields an empty result.
///   - targetSamples: Number of output values to aim for. Must be positive; otherwise an empty
///     array is returned.
/// - Returns: The averaged dB levels. Empty when the reader cannot be set up; whatever was decoded
///   so far when reading fails part-way.
func render(audioContext: AudioContext?, targetSamples: Int = 100) -> [Float]{
    guard let audioContext = audioContext else {
        print("Couldn't create the audioContext")
        return []
    }

    // A non-positive target would divide by zero when computing samplesPerPixel.
    guard targetSamples > 0 else {
        print("targetSamples must be greater than zero")
        return []
    }

    // NOTE(review): only the first third of `totalSamples` is requested, and the sample indices are
    // expressed in the asset duration's timescale rather than the sample rate. Kept as in the
    // original; confirm this is intended.
    let sampleRange: CountableRange<Int> = 0..<max(0, audioContext.totalSamples/3)

    let reader: AVAssetReader
    do {
        reader = try AVAssetReader(asset: audioContext.asset)
    } catch {
        print("Couldn't initialize the AVAssetReader: \(error.localizedDescription)")
        return []
    }

    reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
                                   duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))

    let outputSettingsDict: [String : Any] = [
        AVFormatIDKey: Int(kAudioFormatLinearPCM),
        AVLinearPCMBitDepthKey: 16,
        AVLinearPCMIsBigEndianKey: false,
        AVLinearPCMIsFloatKey: false,
        AVLinearPCMIsNonInterleaved: false
    ]

    let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack,
                                                outputSettings: outputSettingsDict)
    readerOutput.alwaysCopiesSampleData = false
    guard reader.canAdd(readerOutput) else {
        print("Couldn't add the track output to the AVAssetReader")
        return []
    }
    reader.add(readerOutput)

    // The channel count of the last format description is used, as in the original loop.
    guard let formatDescriptions = audioContext.assetTrack.formatDescriptions as? [CMAudioFormatDescription] else {
        print("Couldn't get the format description")
        return []
    }
    var channelCount = 1
    for item in formatDescriptions {
        guard let fmtDesc = CMAudioFormatDescriptionGetStreamBasicDescription(item) else {
            print("Couldn't get the format description")
            return []
        }
        channelCount = Int(fmtDesc.pointee.mChannelsPerFrame)
    }

    let samplesPerPixel = max(1, channelCount * sampleRange.count / targetSamples)
    let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)

    var outputSamples = [Float]()
    var sampleBuffer = Data()

    // 16-bit samples
    guard reader.startReading() else {
        print("Couldn't read the audio file: \(reader.error?.localizedDescription ?? "Unknown error")")
        return []
    }
    defer { reader.cancelReading() }

    while reader.status == .reading {
        guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
            let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
                break
        }
        // Append audio sample buffer into our current sample buffer.
        // NOTE(review): this copies only the bytes reported at offset 0; presumably the block
        // buffer is contiguous. Verify if truncated output is ever observed.
        var readBufferLength = 0
        var readBufferPointer: UnsafeMutablePointer<Int8>?
        let pointerStatus = CMBlockBufferGetDataPointer(readBuffer,
                                                        atOffset: 0,
                                                        lengthAtOffsetOut: &readBufferLength,
                                                        totalLengthOut: nil,
                                                        dataPointerOut: &readBufferPointer)
        if pointerStatus == noErr, let pointer = readBufferPointer, readBufferLength > 0 {
            sampleBuffer.append(UnsafeBufferPointer(start: UnsafePointer(pointer), count: readBufferLength))
        }
        CMSampleBufferInvalidate(readSampleBuffer)

        let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
        let downSampledLength = totalSamples / samplesPerPixel
        let samplesToProcess = downSampledLength * samplesPerPixel

        // Not enough data for a full window yet; keep accumulating.
        guard samplesToProcess > 0 else { continue }

        processSamples(fromData: &sampleBuffer,
                       outputSamples: &outputSamples,
                       samplesToProcess: samplesToProcess,
                       downSampledLength: downSampledLength,
                       samplesPerPixel: samplesPerPixel,
                       filter: filter)
    }

    // Process the remaining samples at the end which didn't fit into samplesPerPixel
    let samplesToProcess = sampleBuffer.count / MemoryLayout<Int16>.size
    if samplesToProcess > 0 {
        let downSampledLength = 1
        let samplesPerPixel = samplesToProcess
        let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)

        processSamples(fromData: &sampleBuffer,
                       outputSamples: &outputSamples,
                       samplesToProcess: samplesToProcess,
                       downSampledLength: downSampledLength,
                       samplesPerPixel: samplesPerPixel,
                       filter: filter)
    }

    // The original guard ended in `|| true` and could never fail. A failed read is reported here,
    // and the levels decoded so far are still returned.
    if reader.status == .failed || reader.status == .unknown {
        print("Couldn't read the audio file: \(reader.error?.localizedDescription ?? "Unknown error")")
    }

    return outputSamples
}

/// Converts the first `samplesToProcess` 16-bit samples of `sampleBuffer` to dB, averages them in
/// windows of `samplesPerPixel`, appends the averages to `outputSamples`, and removes the consumed
/// bytes from `sampleBuffer`.
///
/// - Parameters:
///   - sampleBuffer: Raw 16-bit integer samples; the processed prefix is removed on return.
///   - outputSamples: Receives `downSampledLength` new values.
///   - samplesToProcess: Number of Int16 samples to consume from the front of `sampleBuffer`.
///   - downSampledLength: Number of averaged values to produce.
///   - samplesPerPixel: Window length (decimation factor) for each averaged value.
///   - filter: Averaging weights passed to `vDSP_desamp`; needs at least `samplesPerPixel` elements.
///
/// The call does nothing when the arguments are inconsistent with each other or with the amount of
/// data available, because `vDSP_desamp` would otherwise read past the end of its inputs.
func processSamples(fromData sampleBuffer: inout Data,
                    outputSamples: inout [Float],
                    samplesToProcess: Int,
                    downSampledLength: Int,
                    samplesPerPixel: Int,
                    filter: [Float]) {

    let bytesToProcess = samplesToProcess * MemoryLayout<Int16>.size
    guard samplesToProcess > 0,
        downSampledLength > 0,
        samplesPerPixel > 0,
        filter.count >= samplesPerPixel,
        downSampledLength * samplesPerPixel <= samplesToProcess,
        bytesToProcess <= sampleBuffer.count
        else { return }

    var downSampledData = [Float](repeating: 0.0, count: downSampledLength)

    let didProcess: Bool = sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) -> Bool in
        //Create an UnsafePointer<Int16> from samples
        guard let unsafePointer = samples.bindMemory(to: Int16.self).baseAddress else {
            return false
        }

        var processingBuffer = [Float](repeating: 0.0, count: samplesToProcess)

        let sampleCount = vDSP_Length(samplesToProcess)

        //Convert 16bit int samples to floats
        vDSP_vflt16(unsafePointer, 1, &processingBuffer, 1, sampleCount)

        //Take the absolute values to get amplitude
        vDSP_vabs(processingBuffer, 1, &processingBuffer, 1, sampleCount)

        //get the corresponding dB, and clip the results
        getdB(from: &processingBuffer)

        //Downsample and average
        vDSP_desamp(processingBuffer,
                    vDSP_Stride(samplesPerPixel),
                    filter, &downSampledData,
                    vDSP_Length(downSampledLength),
                    vDSP_Length(samplesPerPixel))
        return true
    }

    guard didProcess else { return }

    // The buffer is mutated only after the unsafe pointer into it has gone out of scope.
    sampleBuffer.removeFirst(bytesToProcess)

    outputSamples += downSampledData
}

/// Converts the amplitudes in `normalizedSamples` to decibels in place and clips every value to
/// the range `noiseFloor...0`.
func getdB(from normalizedSamples: inout [Float]) {
    let elementCount = vDSP_Length(normalizedSamples.count)

    // Decibel conversion against a reference of 32768 (the magnitude of Int16.min).
    var reference: Float = 32768.0
    let conversionFlag: UInt32 = 1
    vDSP_vdbcon(normalizedSamples, 1, &reference, &normalizedSamples, 1, elementCount, conversionFlag)

    // Clamp: nothing below the noise floor, nothing above 0 dB.
    var lowerBound = noiseFloor
    var upperBound: Float = 0.0
    vDSP_vclip(normalizedSamples, 1, &lowerBound, &upperBound, &normalizedSamples, 1, elementCount)
}

Old solution

Here is a function you could use to pre-render the meter levels of an audio file without playing it:

/// Computes one average dB power value per chunk of an audio file, without playing it.
///
/// The file is read in chunks of `sampleRate / 20` frames. For each chunk the mean absolute sample
/// value of the chosen channel is converted to dB. Frames left over after the last full chunk are
/// not measured.
///
/// - Parameters:
///   - audioFileURL: Location of the audio file to read.
///   - channelNumber: Zero-based index of the channel to measure.
///   - completionHandler: Called exactly once on the global `.userInitiated` queue with the dB
///     values. The array is empty when the file cannot be opened or `channelNumber` is out of
///     range, and holds the values gathered so far when a read fails part-way.
func averagePowers(audioFileURL: URL, forChannel channelNumber: Int, completionHandler: @escaping(_ success: [Float]) -> ()) {
    //Do the calculations in a background thread, if you don't want to block the main thread for larger audio files
    DispatchQueue.global(qos: .userInitiated).async {
        let audioFile: AVAudioFile
        do {
            audioFile = try AVAudioFile(forReading: audioFileURL)
        } catch {
            print("Couldn't open the audio file: \(error.localizedDescription)")
            completionHandler([])
            return
        }
        let audioFilePFormat = audioFile.processingFormat

        //Set the size of frames to read from the audio file, you can adjust this to your liking
        let frameSizeToRead = Int(audioFilePFormat.sampleRate/20)

        //A zero chunk size would divide by zero below; an invalid channel would index out of bounds
        guard frameSizeToRead > 0,
            channelNumber >= 0,
            channelNumber < Int(audioFilePFormat.channelCount),
            let audioBuffer = AVAudioPCMBuffer(pcmFormat: audioFilePFormat, frameCapacity: AVAudioFrameCount(frameSizeToRead)) else {
                print("Couldn't create the audio buffer")
                completionHandler([])
                return
        }

        //This is to how many frames/portions we're going to divide the audio file
        let numberOfFrames = Int(audioFile.length)/frameSizeToRead

        //This is the array to be returned
        var returnArray = [Float]()
        returnArray.reserveCapacity(numberOfFrames)

        //We're going to read the audio file, frame by frame
        for i in 0..<numberOfFrames {

            //Change the position from which we are reading the audio file, since each frame starts from a different position in the audio file
            audioFile.framePosition = AVAudioFramePosition(i * frameSizeToRead)

            //Read the frame from the audio file
            do {
                try audioFile.read(into: audioBuffer, frameCount: AVAudioFrameCount(frameSizeToRead))
            } catch {
                print("Couldn't read the audio file: \(error.localizedDescription)")
                break
            }

            //Only measure the frames that were actually read into the buffer
            let framesRead = Int(audioBuffer.frameLength)
            guard framesRead > 0, let channels = audioBuffer.floatChannelData else { break }

            //View the chosen channel's samples in place, without copying them into an array
            let samples = UnsafeBufferPointer(start: channels[channelNumber], count: framesRead)

            //Calculate the mean value of the absolute values
            let meanValue = samples.reduce(0, {$0 + abs($1)})/Float(framesRead)

            //Calculate the dB power (You can adjust this), if average is less than 0.000_000_01 we limit it to -160.0
            let dbPower: Float = meanValue > 0.000_000_01 ? 20 * log10(meanValue) : -160.0

            //append the db power in the current frame to the returnArray
            returnArray.append(dbPower)
        }

        //Return the dBPowers
        completionHandler(returnArray)
    }
}

And you can call it like so:

// The audio file ships with the app bundle, so a missing resource is a programmer error.
let path = Bundle.main.path(forResource: "audio.mp3", ofType:nil)!
let url = URL(fileURLWithPath: path)

// Measure channel 0; the closure receives the dB values.
averagePowers(audioFileURL: url, forChannel: 0) { array in
    //Use the array
}

According to Instruments, this solution causes high CPU usage for about 1.2 seconds, takes about 5 seconds to return to the main thread with the returnArray, and takes up to 10 seconds in Low Power Mode.

回答2:

First of all, this is a heavy operation, so it will take some time and system resources to complete. In the example below I use the standard frame rate and sampling, but you should really sample far less if, for example, you only want to display bars as an indication.

OK, so you don't need to play the sound to analyze it, and I will not use AVAudioPlayer here at all. I assume the track is available as a URL:

    // Resolve the bundled example track to a file URL.
    let path: String = Bundle.main.path(forResource: "example3.mp3", ofType:nil)!
    let url: URL = URL(fileURLWithPath: path)

Then I will use AVAudioFile to get track information into AVAudioPCMBuffer. Whenever you have it in buffer you have all information regarding your track:

/// Reads the whole audio file at `url` into a PCM buffer and passes it to `analyze(buffer:)`.
///
/// Errors thrown while opening or reading the file are printed; nothing is analyzed in that case.
///
/// - Parameter url: Location of the audio file to read.
func buffer(url: URL) {
    do {
        let track = try AVAudioFile(forReading: url)

        // The buffer is created with the file's own processing format, which is the format
        // `read(into:frameCount:)` delivers. The length is checked because it is a 64-bit value
        // while the frame count is 32-bit.
        guard let frameCount = AVAudioFrameCount(exactly: track.length), frameCount > 0,
            let buffer = AVAudioPCMBuffer(pcmFormat: track.processingFormat, frameCapacity: frameCount) else {
                print("Couldn't create the audio buffer")
                return
        }

        try track.read(into : buffer, frameCount: frameCount)
        self.analyze(buffer: buffer)
    } catch {
        print(error)
    }
}

As you may notice, there is an analyze method. The buffer exposes a floatChannelData property; it is plain sample data, so you'll need to parse it. I will post the method first and explain it below:

/// Converts every sample of `buffer` to a dB level.
///
/// Each value is `20 * log10(|sample|)`, so a sample of exactly zero yields `-infinity`. The
/// magnitude is used because the previous `sqrt(sample / frameLength)` produced NaN for every
/// negative sample. The result is now returned (it used to be computed and discarded); callers that
/// ignore it keep compiling thanks to `@discardableResult`.
///
/// - Parameter buffer: PCM buffer to analyze; its `floatChannelData` must be available.
/// - Returns: One array per channel with `frameLength` dB values each, or an empty array when the
///   buffer has no float channel data.
@discardableResult
func analyze(buffer: AVAudioPCMBuffer) -> [[Float]] {
    guard let channelData = buffer.floatChannelData else {
        return []
    }

    let channelCount = Int(buffer.format.channelCount)
    let frameLength = Int(buffer.frameLength)
    let sampleStride = buffer.stride

    return (0..<channelCount).map { channel -> [Float] in
        let samples = channelData[channel]
        return (0..<frameLength).map { sampleIndex -> Float in
            20 * log10(abs(samples[sampleIndex * sampleStride]))
        }
    }
}

There are some (heavy) calculations involved. When I was working on a similar solution a couple of months ago I came across this tutorial: https://www.raywenderlich.com/5154-avaudioengine-tutorial-for-ios-getting-started. There is an excellent explanation of this calculation there, along with parts of the code that I pasted above and also use in my project, so I want to credit the author here: Scott McAlister 👏

来源：https://stackoverflow.com/questions/51706314/extract-meter-levels-from-audio-file

标签

ios

swift

audio

avaudioplayer

audiotoolbox