-
-
Notifications
You must be signed in to change notification settings - Fork 186
feat: add audio narration (updated) #346
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
351d87b
f19a84a
e143767
6f07b93
d3ef09a
9e86193
87a814f
ce84a1b
5c584b2
802c8a2
aca8cdc
42b1007
9f4c280
20d29e1
109ffe0
8d27b4f
d631b2d
ab0805e
e30538b
9469043
47bf845
e9f2d36
9293b0b
a66acbc
888d335
e1a3a18
d7c54f2
3eaa3a8
05834c4
a6e45bd
f23df51
f6cdbc0
873cf6d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
"""Add audio info. | ||
|
||
Revision ID: c176288cb508 | ||
Revises: 8713b142f5de | ||
Create Date: 2023-08-31 00:25:04.889325 | ||
|
||
""" | ||
import sqlalchemy as sa | ||
|
||
from alembic import op | ||
from openadapt.models import ForceFloat | ||
|
||
# revision identifiers, used by Alembic. | ||
revision = "c176288cb508" | ||
down_revision = "8713b142f5de" | ||
branch_labels = None | ||
depends_on = None | ||
|
||
|
||
def upgrade() -> None:
    """Apply the migration: create the ``audio_info`` table.

    The table stores FLAC-compressed narration audio, its Whisper
    transcription, per-word timestamps, and a foreign key linking the
    row to ``recording.timestamp``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    columns = [
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("flac_data", sa.LargeBinary(), nullable=True),
        sa.Column("transcribed_text", sa.String(), nullable=True),
        sa.Column(
            "recording_timestamp",
            ForceFloat(precision=10, scale=2, asdecimal=False),
            nullable=True,
        ),
        sa.Column("sample_rate", sa.Integer(), nullable=True),
        sa.Column("words_with_timestamps", sa.Text(), nullable=True),
    ]
    constraints = [
        sa.ForeignKeyConstraint(
            ["recording_timestamp"],
            ["recording.timestamp"],
            name=op.f("fk_audio_info_recording_timestamp_recording"),
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_audio_info")),
    ]
    op.create_table("audio_info", *columns, *constraints)
    # ### end Alembic commands ###
|
||
|
||
def downgrade() -> None:
    """Revert the migration by dropping the ``audio_info`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("audio_info")
    # ### end Alembic commands ###
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,11 +4,16 @@ | |
|
||
$ python openadapt/record.py "<description of task to be recorded>" | ||
|
||
To record audio: | ||
|
||
$ python openadapt/record.py "<description of task to be recorded>" --enable_audio | ||
|
||
""" | ||
|
||
from collections import namedtuple | ||
from functools import partial, wraps | ||
from typing import Any, Callable, Union | ||
import io | ||
import multiprocessing | ||
import os | ||
import queue | ||
|
@@ -24,7 +29,11 @@ | |
from tqdm import tqdm | ||
import fire | ||
import mss.tools | ||
import numpy as np | ||
import psutil | ||
import sounddevice | ||
import soundfile | ||
import whisper | ||
|
||
from openadapt import config, utils, window | ||
from openadapt.db import crud | ||
|
@@ -804,15 +813,101 @@ def read_mouse_events( | |
mouse_listener.stop() | ||
|
||
|
||
def record_audio(
    terminate_event: multiprocessing.Event,
    recording_timestamp: float,
) -> None:
    """Record audio narration during the recording and store data in database.

    Captures microphone input until ``terminate_event`` is set, transcribes
    it with OpenAI's Whisper, compresses the raw samples to FLAC, and
    persists everything via ``crud.insert_audio_info``.

    Args:
        terminate_event: The event to signal termination of event reading.
        recording_timestamp: The timestamp of the recording.
    """
    utils.configure_logging(logger, LOG_LEVEL)
    utils.set_start_time(recording_timestamp)

    audio_frames = []  # to store audio frames

    def audio_callback(
        indata: np.ndarray, frames: int, time: Any, status: sounddevice.CallbackFlags
    ) -> None:
        """Callback function used when new audio frames are recorded.

        Note: time is of type cffi.FFI.CData, but since we don't use this argument
        and we also don't use the cffi library, the Any type annotation is used.
        """
        # called whenever there is new audio frames
        audio_frames.append(indata.copy())

    # open InputStream and start recording while ActionEvents are recorded
    audio_stream = sounddevice.InputStream(
        callback=audio_callback, samplerate=16000, channels=1
    )
    logger.info("Audio recording started.")
    audio_stream.start()
    terminate_event.wait()
    audio_stream.stop()
    audio_stream.close()

    # FIX: np.concatenate raises ValueError on an empty list; bail out
    # gracefully if the stream produced no frames (e.g. no input device).
    if not audio_frames:
        logger.warning("No audio frames were captured; skipping transcription.")
        return

    # Concatenate into one Numpy array
    concatenated_audio = np.concatenate(audio_frames, axis=0)
    # convert concatenated_audio to format expected by whisper
    converted_audio = concatenated_audio.flatten().astype(np.float32)

    # Convert audio to text using OpenAI's Whisper
    logger.info("Transcribing audio...")
    model = whisper.load_model("base")
    result_info = model.transcribe(converted_audio, word_timestamps=True, fp16=False)
    logger.info(f"The narrated text is: {result_info['text']}")

    # FIX: collect word timestamps from ALL segments, not only the first;
    # longer narrations span multiple segments whose words were being dropped.
    # word_list stays empty if the user didn't say anything.
    word_list = []
    for segment in result_info["segments"]:
        # 'words' may be absent when a segment has no recognized speech
        word_list.extend(segment.get("words", []))

    # compress and convert to bytes to save to database
    logger.info(
        "Size of uncompressed audio data: {} bytes".format(converted_audio.nbytes)
    )
    # Create an in-memory file-like object
    file_obj = io.BytesIO()
    # Write the audio data using lossless compression
    soundfile.write(
        file_obj, converted_audio, int(audio_stream.samplerate), format="FLAC"
    )
    # Get the compressed audio data as bytes
    compressed_audio_bytes = file_obj.getvalue()

    logger.info(
        "Size of compressed audio data: {} bytes".format(len(compressed_audio_bytes))
    )

    file_obj.close()

    # To decompress the audio and restore it to its original form:
    # restored_audio, restored_samplerate = sf.read(
    #     io.BytesIO(compressed_audio_bytes))

    # Create AudioInfo entry
    crud.insert_audio_info(
        compressed_audio_bytes,
        result_info["text"],
        recording_timestamp,
        int(audio_stream.samplerate),
        word_list,
    )
|
||
|
||
@logger.catch | ||
@trace(logger) | ||
def record( | ||
task_description: str, | ||
) -> None: | ||
def record(task_description: str, enable_audio: bool = False) -> None: | ||
"""Record Screenshots/ActionEvents/WindowEvents. | ||
|
||
Args: | ||
task_description: A text description of the task to be recorded. | ||
enable_audio: a flag to enable or disable audio recording (default: False) | ||
""" | ||
logger.info(f"{task_description=}") | ||
|
||
|
@@ -943,6 +1038,13 @@ def record( | |
) | ||
mem_plotter.start() | ||
|
||
if enable_audio: | ||
audio_recorder = threading.Thread( | ||
target=record_audio, | ||
args=(terminate_event, recording_timestamp), | ||
) | ||
audio_recorder.start() | ||
|
||
# TODO: discard events until everything is ready | ||
|
||
collect_stats() | ||
|
@@ -972,6 +1074,9 @@ def record( | |
screen_event_writer.join() | ||
action_event_writer.join() | ||
window_event_writer.join() | ||
if enable_audio: | ||
audio_recorder.join() | ||
|
||
terminate_perf_event.set() | ||
|
||
if PLOT_PERFORMANCE: | ||
|
Uh oh!
There was an error while loading. Please reload this page.