Question
I'm looking to build an application that would let me translate, in real time, any audio going out of the speakers. This way, I would be able to translate any videoconference from any live-stream app (YouTube, Teams, Zoom, etc.). I'm not far from a solution, but not quite there yet.
Source language would be: fr-CA or en-US
Destination language would be: fr-CA or en-US
I was able to get the audio stream back from the speakers with a custom version of pyaudio that allows loopback through the Windows WASAPI (https://github.com/intxcc/pyaudio_portaudio).
The next step is to push that stream in real time to the Azure translation API through the Speech SDK.
So far, the part that gets the stream from the speakers is working, but when I plug it into Azure I don't get any error, yet it doesn't return any result either. In fact, roughly every 30 seconds I receive a reason=ResultReason.NoMatch, or a fragment of text that makes no sense.
My first thought is that the byte stream coming from the speakers, which is 48 kHz, 2 channels, is not supported by the Azure stream (I think I read somewhere that it only supports 16 kHz, 1 channel, but I'm not sure). If that is the case, I have found a way to fold the two channels into one, but I don't know how to drop from 48 kHz to 16 kHz on a chunk of bytes in real time.
Any help would be appreciated! Thanks. Here is my code:
import time
import azure.cognitiveservices.speech as speechsdk
import pyaudio
import numpy as np

speech_key, service_region = "", "westus"
finalResultSRC = ""
finalResultDst = ""
RATE = 48000
KHz_RATE = int(RATE / 1000)
CHUNK = int(RATE)


def translation_continuous():
    """performs continuous speech translation from input from an audio file"""
    # <TranslationContinuous>
    # set up translation parameters: source language and target languages
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key, region=service_region,
        speech_recognition_language='fr-CA')

    # setup the audio stream
    audioFormat = speechsdk.audio.AudioStreamFormat(
        samples_per_second=KHz_RATE, bits_per_sample=16, channels=2)
    stream = speechsdk.audio.PushAudioInputStream(audioFormat)

    translation_config.add_target_language("en-US")

    stream = speechsdk.audio.PushAudioInputStream()
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    # Creates a translation recognizer using an audio stream as input.
    recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config, audio_config=audio_config)

    def result_callback(event_type, evt):
        """callback to display a translation result"""
        # print("{}: {}\n\tTranslations: {}\n\tResult Json: {}".format(
        #     event_type, evt, evt.result.translations.items(), evt.result.json))
        print(evt)
        if event_type == "RECOGNIZING":
            # Translate
            print(evt.result.translations.items()[0][1])
            # Original
            # print(type(evt.result.json))

    done = False

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    # connect callback functions to the events fired by the recognizer
    recognizer.session_started.connect(
        lambda evt: print('SESSION STARTED: {}'.format(evt)))
    recognizer.session_stopped.connect(
        lambda evt: print('SESSION STOPPED {}'.format(evt)))
    # event for intermediate results
    recognizer.recognizing.connect(
        lambda evt: result_callback('RECOGNIZING', evt))
    # event for final result
    recognizer.recognized.connect(
        lambda evt: result_callback('RECOGNIZED', evt))
    # cancellation event
    recognizer.canceled.connect(lambda evt: print(
        'CANCELED: {} ({})'.format(evt, evt.reason)))

    # stop continuous recognition on either session stopped or canceled events
    recognizer.session_stopped.connect(stop_cb)
    recognizer.canceled.connect(stop_cb)

    def synthesis_callback(evt):
        """
        callback for the synthesis event
        """
        print('SYNTHESIZING {}\n\treceived {} bytes of audio. Reason: {}'.format(
            evt, len(evt.result.audio), evt.result.reason))

    # connect callback to the synthesis event
    recognizer.synthesizing.connect(synthesis_callback)

    # start translation
    recognizer.start_continuous_recognition()

    # start pushing data until all data has been read from the file
    try:
        p = pyaudio.PyAudio()
        pstream = p.open(
            format=pyaudio.paInt16,
            channels=2, rate=RATE,
            input=True, frames_per_buffer=CHUNK,
            input_device_index=5,
            as_loopback=True
        )
        while True:
            frame = pstream.read(CHUNK)
            # frames = wav_fh.readframes(n_bytes)
            # print('read {} bytes'.format(len(frames)))
            # if not frames:
            #     print('break')
            #     break
            if frame:
                # ch1 = cutChannelFromStream(frame, 1, 2)
                print('got frame from speakers')
                stream.write(frame)
            time.sleep(1)
    finally:
        # stop recognition and clean up
        stream.close()
        recognizer.stop_continuous_recognition()
        print(finalResultSRC)
        # recognizer.stop_continuous_recognition()
    # </TranslationContinuous>


translation_continuous()
Answer 1:
I found a working solution. I indeed had to downsample to 16000 Hz and use a mono channel. I based my code on this solution, but using stream chunks rather than reading from a file.
My function was:
import audioop

def downsampleFrames(data, inrate=48000, outrate=16000, inchannels=2, outchannels=1):
    try:
        # resample the 16-bit PCM chunk from inrate to outrate
        converted = audioop.ratecv(data, 2, inchannels, inrate, outrate, None)
        if outchannels == 1:
            # keep only the left channel to get mono audio
            converted = audioop.tomono(converted[0], 2, 1, 0)
    except:
        print('Failed to downsample')
        return False
    return converted
and from pyaudio, I send a chunk of data like this:
p = pyaudio.PyAudio()
pstream = p.open(
    format=pyaudio.paInt16,
    channels=2, rate=RATE,
    input=True, frames_per_buffer=CHUNK,
    input_device_index=5,
    as_loopback=True
)
while True:
    frame = pstream.read(CHUNK)
    if frame:
        downFrame = downsampleFrames(frame)
        stream.write(downFrame)
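For completeness, here is a minimal sketch of how the Azure push stream itself might be declared so that its format matches the downsampled audio (16 kHz, 16-bit, mono). The 16 kHz/mono values come from the answer above; reusing speech_key, service_region, and the fr-CA/en-US language pair from the question's code is an assumption, not part of the original answer.

# Minimal sketch (assumption): declare a push stream whose format matches
# the bytes produced by downsampleFrames(), i.e. 16 kHz, 16-bit, mono PCM.
import azure.cognitiveservices.speech as speechsdk

audioFormat = speechsdk.audio.AudioStreamFormat(
    samples_per_second=16000, bits_per_sample=16, channels=1)
stream = speechsdk.audio.PushAudioInputStream(stream_format=audioFormat)
audio_config = speechsdk.audio.AudioConfig(stream=stream)

translation_config = speechsdk.translation.SpeechTranslationConfig(
    subscription=speech_key, region=service_region,
    speech_recognition_language='fr-CA')
translation_config.add_target_language("en-US")

recognizer = speechsdk.translation.TranslationRecognizer(
    translation_config=translation_config, audio_config=audio_config)
recognizer.start_continuous_recognition()
# ...then push the downsampled chunks with stream.write(downFrame) as shown above.

Declaring the format explicitly avoids relying on the SDK's default push-stream format and keeps it consistent with whatever downsampleFrames() actually outputs.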
Source: https://stackoverflow.com/questions/65586642/translate-audio-from-speaker-output-in-python-with-azuresdk