Question
I'm looking to build an application that would let me translate, in real time, any audio going out of the speakers. This way, I would be able to translate any videoconference from any live-stream app (YouTube, Teams, Zoom, etc.). I'm not far from a solution, but not quite there yet.
Source language would be: fr-CA or en-US
Destination language would be: fr-CA or en-US
I was able to get the audio stream back from the speakers with a custom version of pyaudio that allows loopback through the Windows WASAPI (https://github.com/intxcc/pyaudio_portaudio).
The next step is to push that stream in real time to the Azure translation API through the Speech SDK.
So far, the part that gets the stream from the speakers is working, but when I plug it into Azure I don't get any error, yet it doesn't return any result either. In fact, roughly every 30 seconds I receive a reason=ResultReason.NoMatch, or a fragment of text that makes no sense.
My first thought is that the byte stream coming from the speakers, which is 48 kHz, 2 channels, is not supported by the Azure stream (I think I read somewhere that it only supports 16 kHz, 1 channel, but I'm not sure). If that is the case, I have found a way to fold the two channels into one, but I don't know how to drop from 48 kHz to 16 kHz on a chunk of bytes in real time.
Any help would be appreciated! Thanks. Here is my code:
import time
import azure.cognitiveservices.speech as speechsdk
import pyaudio
import numpy as np

speech_key, service_region = "", "westus"
finalResultSRC = ""
finalResultDst = ""
RATE = 48000
KHz_RATE = int(RATE / 1000)
CHUNK = int(RATE)


def translation_continuous():
    """performs continuous speech translation from input from an audio file"""
    # <TranslationContinuous>
    # set up translation parameters: source language and target languages
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key, region=service_region,
        speech_recognition_language='fr-CA')

    # setup the audio stream
    audioFormat = speechsdk.audio.AudioStreamFormat(
        samples_per_second=KHz_RATE, bits_per_sample=16, channels=2)
    stream = speechsdk.audio.PushAudioInputStream(audioFormat)

    translation_config.add_target_language("en-US")

    stream = speechsdk.audio.PushAudioInputStream()
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    # Creates a translation recognizer using an audio stream as input.
    recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config, audio_config=audio_config)

    def result_callback(event_type, evt):
        """callback to display a translation result"""
        # print("{}: {}\n\tTranslations: {}\n\tResult Json: {}".format(
        #     event_type, evt, evt.result.translations.items(), evt.result.json))
        print(evt)
        if event_type == "RECOGNIZING":
            # Translate
            print(evt.result.translations.items()[0][1])
            # Original
            # print(type(evt.result.json))

    done = False

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    # connect callback functions to the events fired by the recognizer
    recognizer.session_started.connect(
        lambda evt: print('SESSION STARTED: {}'.format(evt)))
    recognizer.session_stopped.connect(
        lambda evt: print('SESSION STOPPED {}'.format(evt)))
    # event for intermediate results
    recognizer.recognizing.connect(
        lambda evt: result_callback('RECOGNIZING', evt))
    # event for final result
    recognizer.recognized.connect(
        lambda evt: result_callback('RECOGNIZED', evt))
    # cancellation event
    recognizer.canceled.connect(lambda evt: print(
        'CANCELED: {} ({})'.format(evt, evt.reason)))

    # stop continuous recognition on either session stopped or canceled events
    recognizer.session_stopped.connect(stop_cb)
    recognizer.canceled.connect(stop_cb)

    def synthesis_callback(evt):
        """
        callback for the synthesis event
        """
        print('SYNTHESIZING {}\n\treceived {} bytes of audio. Reason: {}'.format(
            evt, len(evt.result.audio), evt.result.reason))

    # connect callback to the synthesis event
    recognizer.synthesizing.connect(synthesis_callback)

    # start translation
    recognizer.start_continuous_recognition()

    # start pushing data until all data has been read from the file
    try:
        p = pyaudio.PyAudio()
        pstream = p.open(
            format=pyaudio.paInt16,
            channels=2, rate=RATE,
            input=True, frames_per_buffer=CHUNK,
            input_device_index=5,
            as_loopback=True
        )
        while True:
            frame = pstream.read(CHUNK)
            # frames = wav_fh.readframes(n_bytes)
            # print('read {} bytes'.format(len(frames)))
            # if not frames:
            #     print('break')
            #     break
            if frame:
                # ch1 = cutChannelFromStream(frame, 1, 2)
                print('got frame from speakers')
                stream.write(frame)
            time.sleep(1)
    finally:
        # stop recognition and clean up
        stream.close()
        recognizer.stop_continuous_recognition()
        print(finalResultSRC)
        # recognizer.stop_continuous_recognition()
    # </TranslationContinuous>


translation_continuous()
Answer 1:
I found a working solution. I indeed had to downsample to 16000 Hz and use a mono channel. I based my code on this solution, but using stream chunks rather than reading from a file.
My function was:
import audioop

def downsampleFrames(data, inrate=48000, outrate=16000, inchannels=2, outchannels=1):
    try:
        # resample the 16-bit PCM chunk from inrate to outrate
        converted = audioop.ratecv(data, 2, inchannels, inrate, outrate, None)
        if outchannels == 1:
            # keep only the left channel to get mono audio
            converted = audioop.tomono(converted[0], 2, 1, 0)
    except:
        print('Failed to downsample')
        return False
    return converted
and from pyaudio, I send a chunk of data like this:
p = pyaudio.PyAudio()
pstream = p.open(
    format=pyaudio.paInt16,
    channels=2, rate=RATE,
    input=True, frames_per_buffer=CHUNK,
    input_device_index=5,
    as_loopback=True
)
while True:
    frame = pstream.read(CHUNK)
    if frame:
        downFrame = downsampleFrames(frame)
        stream.write(downFrame)
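For completeness, here is a minimal sketch of how the Azure push stream itself might be declared so that its format matches the downsampled audio (16 kHz, 16-bit, mono). The 16 kHz/mono values come from the answer above; reusing speech_key, service_region, and the fr-CA/en-US language pair from the question's code is an assumption, not part of the original answer.

# Minimal sketch (assumption): declare a push stream whose format matches
# the bytes produced by downsampleFrames(), i.e. 16 kHz, 16-bit, mono PCM.
import azure.cognitiveservices.speech as speechsdk

audioFormat = speechsdk.audio.AudioStreamFormat(
    samples_per_second=16000, bits_per_sample=16, channels=1)
stream = speechsdk.audio.PushAudioInputStream(stream_format=audioFormat)
audio_config = speechsdk.audio.AudioConfig(stream=stream)

translation_config = speechsdk.translation.SpeechTranslationConfig(
    subscription=speech_key, region=service_region,
    speech_recognition_language='fr-CA')
translation_config.add_target_language("en-US")

recognizer = speechsdk.translation.TranslationRecognizer(
    translation_config=translation_config, audio_config=audio_config)
recognizer.start_continuous_recognition()
# ...then push the downsampled chunks with stream.write(downFrame) as shown above.

Declaring the format explicitly avoids relying on the SDK's default push-stream format and keeps it consistent with whatever downsampleFrames() actually outputs.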
Source: https://stackoverflow.com/questions/65586642/translate-audio-from-speaker-output-in-python-with-azuresdk