I need to extract audio meter levels from a file so I can render the levels before playing the audio. I know AVAudioPlayer
can get this information while playing th
It takes on an iPhone:
0.538s to process an 8MByte
mp3 player with a 4min47s
duration, and 44,100
sampling rate
0.170s to process an 712KByte
mp3 player with a 22s
duration, and 44,100
sampling rate
0.089s to process caf
file created by converting the file above using this command afconvert -f caff -d LEI16 audio.mp3 audio.caf
in the terminal.
Let's begin:
A) Declare this class that is going to hold the necessary information about the audio asset:
/// Holds audio information used for building waveforms
final class AudioContext {
/// The audio asset URL used to load the context
public let audioURL: URL
/// Total number of samples in loaded asset
public let totalSamples: Int
/// Loaded asset
public let asset: AVAsset
// Loaded assetTrack
public let assetTrack: AVAssetTrack
private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
self.audioURL = audioURL
self.totalSamples = totalSamples
self.asset = asset
self.assetTrack = assetTrack
}
public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
fatalError("Couldn't load AVAssetTrack")
}
asset.loadValuesAsynchronously(forKeys: ["duration"]) {
var error: NSError?
let status = asset.statusOfValue(forKey: "duration", error: &error)
switch status {
case .loaded:
guard
let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
let audioFormatDesc = formatDescriptions.first,
let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
else { break }
let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
completionHandler(audioContext)
return
case .failed, .cancelled, .loading, .unknown:
print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
}
completionHandler(nil)
}
}
}
We are going to use its asynchronous function load
, and handle its result to a completion handler.
B) Import AVFoundation
and Accelerate
in your view controller:
import AVFoundation
import Accelerate
C) Declare the noise level in your view controller (in dB):
let noiseFloor: Float = -80
For example, anything less than -80dB
will be considered as silence.
D) The following function takes an audio context and produces the desired dB powers. targetSamples
is by default set to 100, you can change that to suit your UI needs:
func render(audioContext: AudioContext?, targetSamples: Int = 100) -> [Float]{
guard let audioContext = audioContext else {
fatalError("Couldn't create the audioContext")
}
let sampleRange: CountableRange = 0..?
CMBlockBufferGetDataPointer(readBuffer, 0, &readBufferLength, nil, &readBufferPointer)
sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
CMSampleBufferInvalidate(readSampleBuffer)
let totalSamples = sampleBuffer.count / MemoryLayout.size
let downSampledLength = totalSamples / samplesPerPixel
let samplesToProcess = downSampledLength * samplesPerPixel
guard samplesToProcess > 0 else { continue }
processSamples(fromData: &sampleBuffer,
outputSamples: &outputSamples,
samplesToProcess: samplesToProcess,
downSampledLength: downSampledLength,
samplesPerPixel: samplesPerPixel,
filter: filter)
//print("Status: \(reader.status)")
}
// Process the remaining samples at the end which didn't fit into samplesPerPixel
let samplesToProcess = sampleBuffer.count / MemoryLayout.size
if samplesToProcess > 0 {
let downSampledLength = 1
let samplesPerPixel = samplesToProcess
let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)
processSamples(fromData: &sampleBuffer,
outputSamples: &outputSamples,
samplesToProcess: samplesToProcess,
downSampledLength: downSampledLength,
samplesPerPixel: samplesPerPixel,
filter: filter)
//print("Status: \(reader.status)")
}
// if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
guard reader.status == .completed else {
fatalError("Couldn't read the audio file")
}
return outputSamples
}
E) render
uses this function to down-sample the data from the audio file, and convert to decibels:
func processSamples(fromData sampleBuffer: inout Data,
outputSamples: inout [Float],
samplesToProcess: Int,
downSampledLength: Int,
samplesPerPixel: Int,
filter: [Float]) {
sampleBuffer.withUnsafeBytes { (samples: UnsafePointer) in
var processingBuffer = [Float](repeating: 0.0, count: samplesToProcess)
let sampleCount = vDSP_Length(samplesToProcess)
//Convert 16bit int samples to floats
vDSP_vflt16(samples, 1, &processingBuffer, 1, sampleCount)
//Take the absolute values to get amplitude
vDSP_vabs(processingBuffer, 1, &processingBuffer, 1, sampleCount)
//get the corresponding dB, and clip the results
getdB(from: &processingBuffer)
//Downsample and average
var downSampledData = [Float](repeating: 0.0, count: downSampledLength)
vDSP_desamp(processingBuffer,
vDSP_Stride(samplesPerPixel),
filter, &downSampledData,
vDSP_Length(downSampledLength),
vDSP_Length(samplesPerPixel))
//Remove processed samples
sampleBuffer.removeFirst(samplesToProcess * MemoryLayout.size)
outputSamples += downSampledData
}
}
F) Which in turn calls this function that gets the corresponding dB, and clips the results to [noiseFloor, 0]
:
func getdB(from normalizedSamples: inout [Float]) {
// Convert samples to a log scale
var zero: Float = 32768.0
vDSP_vdbcon(normalizedSamples, 1, &zero, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count), 1)
//Clip to [noiseFloor, 0]
var ceil: Float = 0.0
var noiseFloorMutable = noiseFloor
vDSP_vclip(normalizedSamples, 1, &noiseFloorMutable, &ceil, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count))
}
G) Finally you can get the waveform of the audio like so:
guard let path = Bundle.main.path(forResource: "audio", ofType:"mp3") else {
fatalError("Couldn't find the file path")
}
let url = URL(fileURLWithPath: path)
var outputArray : [Float] = []
AudioContext.load(fromAudioURL: url, completionHandler: { audioContext in
guard let audioContext = audioContext else {
fatalError("Couldn't create the audioContext")
}
outputArray = self.render(audioContext: audioContext, targetSamples: 300)
})
Don't forget that AudioContext.load(fromAudioURL:)
is asynchronous.
This solution is synthesized from this repo by William Entriken. All credit goes to him.
Here is the same code updated to Swift 5 syntax:
import AVFoundation
import Accelerate
/// Holds audio information used for building waveforms
final class AudioContext {
/// The audio asset URL used to load the context
public let audioURL: URL
/// Total number of samples in loaded asset
public let totalSamples: Int
/// Loaded asset
public let asset: AVAsset
// Loaded assetTrack
public let assetTrack: AVAssetTrack
private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
self.audioURL = audioURL
self.totalSamples = totalSamples
self.asset = asset
self.assetTrack = assetTrack
}
public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
fatalError("Couldn't load AVAssetTrack")
}
asset.loadValuesAsynchronously(forKeys: ["duration"]) {
var error: NSError?
let status = asset.statusOfValue(forKey: "duration", error: &error)
switch status {
case .loaded:
guard
let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
let audioFormatDesc = formatDescriptions.first,
let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
else { break }
let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
completionHandler(audioContext)
return
case .failed, .cancelled, .loading, .unknown:
print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
}
completionHandler(nil)
}
}
}
let noiseFloor: Float = -80
func render(audioContext: AudioContext?, targetSamples: Int = 100) -> [Float]{
guard let audioContext = audioContext else {
fatalError("Couldn't create the audioContext")
}
let sampleRange: CountableRange = 0..?
CMBlockBufferGetDataPointer(readBuffer,
atOffset: 0,
lengthAtOffsetOut: &readBufferLength,
totalLengthOut: nil,
dataPointerOut: &readBufferPointer)
sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
CMSampleBufferInvalidate(readSampleBuffer)
let totalSamples = sampleBuffer.count / MemoryLayout.size
let downSampledLength = totalSamples / samplesPerPixel
let samplesToProcess = downSampledLength * samplesPerPixel
guard samplesToProcess > 0 else { continue }
processSamples(fromData: &sampleBuffer,
outputSamples: &outputSamples,
samplesToProcess: samplesToProcess,
downSampledLength: downSampledLength,
samplesPerPixel: samplesPerPixel,
filter: filter)
//print("Status: \(reader.status)")
}
// Process the remaining samples at the end which didn't fit into samplesPerPixel
let samplesToProcess = sampleBuffer.count / MemoryLayout.size
if samplesToProcess > 0 {
let downSampledLength = 1
let samplesPerPixel = samplesToProcess
let filter = [Float](repeating: 1.0 / Float(samplesPerPixel), count: samplesPerPixel)
processSamples(fromData: &sampleBuffer,
outputSamples: &outputSamples,
samplesToProcess: samplesToProcess,
downSampledLength: downSampledLength,
samplesPerPixel: samplesPerPixel,
filter: filter)
//print("Status: \(reader.status)")
}
// if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
guard reader.status == .completed else {
fatalError("Couldn't read the audio file")
}
return outputSamples
}
func processSamples(fromData sampleBuffer: inout Data,
outputSamples: inout [Float],
samplesToProcess: Int,
downSampledLength: Int,
samplesPerPixel: Int,
filter: [Float]) {
sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) in
var processingBuffer = [Float](repeating: 0.0, count: samplesToProcess)
let sampleCount = vDSP_Length(samplesToProcess)
//Create an UnsafePointer from samples
let unsafeBufferPointer = samples.bindMemory(to: Int16.self)
let unsafePointer = unsafeBufferPointer.baseAddress!
//Convert 16bit int samples to floats
vDSP_vflt16(unsafePointer, 1, &processingBuffer, 1, sampleCount)
//Take the absolute values to get amplitude
vDSP_vabs(processingBuffer, 1, &processingBuffer, 1, sampleCount)
//get the corresponding dB, and clip the results
getdB(from: &processingBuffer)
//Downsample and average
var downSampledData = [Float](repeating: 0.0, count: downSampledLength)
vDSP_desamp(processingBuffer,
vDSP_Stride(samplesPerPixel),
filter, &downSampledData,
vDSP_Length(downSampledLength),
vDSP_Length(samplesPerPixel))
//Remove processed samples
sampleBuffer.removeFirst(samplesToProcess * MemoryLayout.size)
outputSamples += downSampledData
}
}
func getdB(from normalizedSamples: inout [Float]) {
// Convert samples to a log scale
var zero: Float = 32768.0
vDSP_vdbcon(normalizedSamples, 1, &zero, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count), 1)
//Clip to [noiseFloor, 0]
var ceil: Float = 0.0
var noiseFloorMutable = noiseFloor
vDSP_vclip(normalizedSamples, 1, &noiseFloorMutable, &ceil, &normalizedSamples, 1, vDSP_Length(normalizedSamples.count))
}
Here is a function you could use to pre-render the meter levels of an audio file without playing it:
func averagePowers(audioFileURL: URL, forChannel channelNumber: Int, completionHandler: @escaping(_ success: [Float]) -> ()) {
let audioFile = try! AVAudioFile(forReading: audioFileURL)
let audioFilePFormat = audioFile.processingFormat
let audioFileLength = audioFile.length
//Set the size of frames to read from the audio file, you can adjust this to your liking
let frameSizeToRead = Int(audioFilePFormat.sampleRate/20)
//This is to how many frames/portions we're going to divide the audio file
let numberOfFrames = Int(audioFileLength)/frameSizeToRead
//Create a pcm buffer the size of a frame
guard let audioBuffer = AVAudioPCMBuffer(pcmFormat: audioFilePFormat, frameCapacity: AVAudioFrameCount(frameSizeToRead)) else {
fatalError("Couldn't create the audio buffer")
}
//Do the calculations in a background thread, if you don't want to block the main thread for larger audio files
DispatchQueue.global(qos: .userInitiated).async {
//This is the array to be returned
var returnArray : [Float] = [Float]()
//We're going to read the audio file, frame by frame
for i in 0.. 0.000_000_01 ? 20 * log10(meanValue) : -160.0
//append the db power in the current frame to the returnArray
returnArray.append(dbPower)
}
//Return the dBPowers
completionHandler(returnArray)
}
}
And you can call it like so:
let path = Bundle.main.path(forResource: "audio.mp3", ofType:nil)!
let url = URL(fileURLWithPath: path)
averagePowers(audioFileURL: url, forChannel: 0, completionHandler: { array in
//Use the array
})
Using instruments, this solution makes high cpu usage during 1.2 seconds, takes about 5 seconds to return to the main thread with the returnArray
, and up to 10 seconds when on low battery mode.