""" StreamWriter Basic Usage ======================== **Author**: `Moto Hira `__ This tutorial shows how to use :py:class:`torchaudio.io.StreamWriter` to encode and save audio/video data into various formats/destinations. """ ###################################################################### # # .. note:: # # This tutorial requires FFmpeg libraries. # Please refer to :ref:`FFmpeg dependency ` for # the detail. # ###################################################################### # # .. warning:: # # TorchAudio dynamically loads compatible FFmpeg libraries # installed on the system. # The types of supported formats (media format, encoder, encoder # options, etc) depend on the libraries. # # To check the available muxers and encoders, you can use the # following command # # .. code-block:: console # # ffmpeg -muxers # ffmpeg -encoders ###################################################################### # # Preparation # ----------- import torch import torchaudio print(torch.__version__) print(torchaudio.__version__) from torchaudio.io import StreamWriter print("FFmpeg library versions") for k, v in torchaudio.utils.ffmpeg_utils.get_versions().items(): print(f" {k}: {v}") ###################################################################### # import io import os import tempfile from IPython.display import Audio, Video from torchaudio.utils import download_asset SAMPLE_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_PATH, channels_first=False) NUM_FRAMES, NUM_CHANNELS = WAVEFORM.shape _BASE_DIR = tempfile.TemporaryDirectory() def get_path(filename): return os.path.join(_BASE_DIR.name, filename) ###################################################################### # # The basic usage # --------------- # # To save Tensor data into media formats with StreamWriter, there # are three necessary steps # # 1. Specify the output # 2. Configure streams # 3. Write data # # The following code illustrates how to save audio data as WAV file. # ###################################################################### # # 1. Define the destination. (local file in this case) path = get_path("test.wav") s = StreamWriter(path) ###################################################################### # # 2. Configure the stream. (8kHz, Stereo WAV) s.add_audio_stream( sample_rate=SAMPLE_RATE, num_channels=NUM_CHANNELS, ) ###################################################################### # # 3. Write the data with s.open(): s.write_audio_chunk(0, WAVEFORM) ###################################################################### # Audio(path) ###################################################################### # # Now we look into each step in more detail. ###################################################################### # # Write destination # ----------------- # # StreamWriter supports different types of write destinations # # 1. Local files # 2. File-like objects # 3. Streaming protocols (such as RTMP and UDP) # 4. Media devices (speakers and video players) † # # † For media devices, please refer to # `StreamWriter Advanced Usages <./streamwriter_advanced.html>`__. # ###################################################################### # Local files # ~~~~~~~~~~~ # # StreamWriter supports saving media to local files. # # # .. code:: # # StreamWriter(dst="audio.wav") # # StreamWriter(dst="audio.mp3") # # This works for still images and videos as well. # # .. 
######################################################################
# File-like objects
# ~~~~~~~~~~~~~~~~~
#
# You can also pass a file-like object. A file-like object must implement
# a ``write`` method conforming to :py:attr:`io.RawIOBase.write`.
#
# .. code::
#
#    # Open the local file as fileobj
#    with open("audio.wav", "wb") as dst:
#        StreamWriter(dst=dst)
#
# .. code::
#
#    # In-memory encoding
#    buffer = io.BytesIO()
#    StreamWriter(dst=buffer)
#

######################################################################
# Streaming protocols
# ~~~~~~~~~~~~~~~~~~~
#
# You can stream the media with streaming protocols
#
# .. code::
#
#    # Real-Time Messaging Protocol
#    StreamWriter(dst="rtmp://localhost:1234/live/app", format="flv")
#
#    # UDP
#    StreamWriter(dst="udp://localhost:48550", format="mpegts")
#

######################################################################
#
# Configuring output streams
# --------------------------
#
# Once the destination is specified, the next step is to configure the streams.
# For typical audio and still image cases, only one stream is required,
# but for video with audio, at least two streams (one for audio and the other
# for video) need to be configured.
#

######################################################################
# Audio Stream
# ~~~~~~~~~~~~
#
# An audio stream can be added with the
# :py:meth:`~torchaudio.io.StreamWriter.add_audio_stream` method.
#
# For writing regular audio files, at minimum ``sample_rate`` and ``num_channels``
# are required.
#
# .. code::
#
#    s = StreamWriter("audio.wav")
#    s.add_audio_stream(sample_rate=8000, num_channels=2)
#
# By default, audio streams expect the input waveform tensors to be of ``torch.float32`` type.
# In the above case, the data will be encoded into the default encoding format of the WAV format,
# which is 16-bit signed integer Linear PCM. StreamWriter converts the sample format internally.
#
# If the encoder supports multiple sample formats and you want to change the encoder sample format,
# you can use the ``encoder_format`` option.
#
# In the following example, the StreamWriter expects the data type of the input waveform Tensor
# to be ``torch.float32``, but it will convert the samples to 16-bit signed integer when encoding.
#
# .. code::
#
#    s = StreamWriter("audio.mp3")
#    s.add_audio_stream(
#        ...,
#        encoder="libmp3lame",   # "libmp3lame" is often the default encoder for mp3,
#                                # but specifying it manually, for the sake of illustration.
#
#        encoder_format="s16p",  # "libmp3lame" encoder supports the following sample formats.
#                                # - "s16p" (16-bit signed integer)
#                                # - "s32p" (32-bit signed integer)
#                                # - "fltp" (32-bit floating point)
#    )
#
# If the data type of your waveform Tensor is something other than ``torch.float32``,
# you can provide the ``format`` option to change the expected data type.
#
# The following example configures StreamWriter to expect a Tensor of ``torch.int16`` type.
#
# .. code::
#
#    # Audio data passed to StreamWriter must be torch.int16
#    s.add_audio_stream(..., format="s16")
#
# The following figure illustrates how the ``format`` and ``encoder_format`` options work
# for audio streams.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/streamwriter-format-audio.png
#
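######################################################################
#
# As a concrete illustration of the ``format`` option, the sketch below
# writes a ``torch.int16`` waveform directly. The file name and the random
# signal are hypothetical and only serve to show the expected shapes and
# dtypes.
#
# .. code::
#
#    import torch
#    from torchaudio.io import StreamWriter
#
#    # Hypothetical int16 waveform of shape (time, channel)
#    waveform_int16 = torch.randint(-32768, 32768, (8000, 1), dtype=torch.int16)
#
#    w = StreamWriter("int16_audio.wav")
#    # format="s16" tells StreamWriter to expect torch.int16 input;
#    # the WAV default encoding (16-bit signed PCM) then needs no conversion.
#    w.add_audio_stream(sample_rate=8000, num_channels=1, format="s16")
#    with w.open():
#        w.write_audio_chunk(0, waveform_int16)
#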
######################################################################
# Video Stream
# ~~~~~~~~~~~~
#
# To add a still image or a video stream, you can use the
# :py:meth:`~torchaudio.io.StreamWriter.add_video_stream` method.
#
# At minimum, ``frame_rate``, ``height`` and ``width`` are required.
#
# .. code::
#
#    s = StreamWriter("video.mp4")
#    s.add_video_stream(frame_rate=10, height=96, width=128)
#
# For still images, please use ``frame_rate=1``.
#
# .. code::
#
#    s = StreamWriter("image.png")
#    s.add_video_stream(frame_rate=1, ...)
#
# Similar to the audio stream, you can provide the ``format`` and ``encoder_format``
# options to control the format of the input data and the encoding.
#
# The following example encodes video data in YUV422 format.
#
# .. code::
#
#    s = StreamWriter("video.mov")
#    s.add_video_stream(
#        ...,
#        encoder="libx264",         # libx264 supports different YUV formats, such as
#                                   # yuv420p yuvj420p yuv422p yuvj422p yuv444p yuvj444p nv12 nv16 nv21
#
#        encoder_format="yuv422p",  # StreamWriter will convert the input data to YUV422 internally
#    )
#
# YUV formats are commonly used in video encoding. In many YUV formats, the chroma
# planes have a different size than the luma plane, which makes it difficult to
# express them directly as a ``torch.Tensor``.
# Therefore, StreamWriter automatically converts the input video Tensor into the target format.
#
# StreamWriter expects the input image tensor to be 4-D (`time`, `channel`, `height`, `width`)
# and of ``torch.uint8`` type.
#
# The default color channel order is RGB, that is, three color channels corresponding to
# red, green and blue. If your input uses a different color representation, such as BGR or YUV,
# you can specify it with the ``format`` option.
#
# The following example specifies BGR format.
#
# .. code::
#
#    s.add_video_stream(..., format="bgr24")
#                       # Image data passed to StreamWriter must have
#                       # three color channels representing Blue Green Red.
#                       #
#                       # The shape of the input tensor has to be
#                       # (time, channel==3, height, width)
#
# The following figure illustrates how the ``format`` and ``encoder_format`` options work for
# video streams.
#
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/streamwriter-format-video.png
#
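######################################################################
#
# Putting the video options together, the following sketch writes one second
# of random BGR frames. The output path is hypothetical, and the
# ``encoder_format="yuv420p"`` choice assumes the encoder your FFmpeg build
# selects for MP4 accepts that pixel format (common for H.264/MPEG-4).
#
# .. code::
#
#    import torch
#    from torchaudio.io import StreamWriter
#
#    frame_rate, height, width = 10, 96, 128
#
#    w = StreamWriter("bgr_video.mp4")
#    w.add_video_stream(
#        frame_rate=frame_rate,
#        height=height,
#        width=width,
#        format="bgr24",            # input tensor is interpreted as Blue-Green-Red
#        encoder_format="yuv420p",  # convert to YUV420 when encoding
#    )
#
#    # One second of frames: (time, channel=3, height, width), torch.uint8
#    chunk = torch.randint(256, (frame_rate, 3, height, width), dtype=torch.uint8)
#    with w.open():
#        w.write_video_chunk(0, chunk)
#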
######################################################################
#
# Write data
# ----------
#
# Once streams are configured, the next step is to open the output location
# and start writing data.
#
# Use the :py:meth:`~torchaudio.io.StreamWriter.open` method to open the
# destination, and then write data with :py:meth:`~torchaudio.io.StreamWriter.write_audio_chunk`
# and/or :py:meth:`~torchaudio.io.StreamWriter.write_video_chunk`.
#
# Audio tensors are expected to have the shape of `(time, channels)`,
# and video/image tensors are expected to have the shape of `(time, channels, height, width)`.
#
# The number of channels, height and width must match the configuration of the corresponding
# stream, specified with the ``format`` option.
#
# A tensor representing a still image must have only one frame in the time dimension,
# but audio and video tensors can have an arbitrary number of frames in the time dimension.
#
# The following code snippets illustrate this.
#

######################################################################
# Ex) Audio
# ~~~~~~~~~
#

# Configure stream
s = StreamWriter(dst=get_path("audio.wav"))
s.add_audio_stream(sample_rate=SAMPLE_RATE, num_channels=NUM_CHANNELS)

# Write data
with s.open():
    s.write_audio_chunk(0, WAVEFORM)

######################################################################
# Ex) Image
# ~~~~~~~~~
#

# Image config
height = 96
width = 128

# Configure stream
s = StreamWriter(dst=get_path("image.png"))
s.add_video_stream(frame_rate=1, height=height, width=width, format="rgb24")

# Generate image
chunk = torch.randint(256, (1, 3, height, width), dtype=torch.uint8)

# Write data
with s.open():
    s.write_video_chunk(0, chunk)

######################################################################
# Ex) Video without audio
# ~~~~~~~~~~~~~~~~~~~~~~~
#

# Video config
frame_rate = 30
height = 96
width = 128

# Configure stream
s = StreamWriter(dst=get_path("video.mp4"))
s.add_video_stream(frame_rate=frame_rate, height=height, width=width, format="rgb24")

# Generate video chunk (3 seconds)
time = int(frame_rate * 3)
chunk = torch.randint(256, (time, 3, height, width), dtype=torch.uint8)

# Write data
with s.open():
    s.write_video_chunk(0, chunk)

######################################################################
# Ex) Video with audio
# ~~~~~~~~~~~~~~~~~~~~
#
# To write video with audio, separate streams have to be configured.
#

# Configure stream
s = StreamWriter(dst=get_path("video.mp4"))
s.add_audio_stream(sample_rate=SAMPLE_RATE, num_channels=NUM_CHANNELS)
s.add_video_stream(frame_rate=frame_rate, height=height, width=width, format="rgb24")

# Generate audio/video chunk (3 seconds)
time = int(SAMPLE_RATE * 3)
audio_chunk = torch.randn((time, NUM_CHANNELS))
time = int(frame_rate * 3)
video_chunk = torch.randint(256, (time, 3, height, width), dtype=torch.uint8)

# Write data
with s.open():
    s.write_audio_chunk(0, audio_chunk)
    s.write_video_chunk(1, video_chunk)

######################################################################
# Writing data chunk by chunk
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# When writing data, it is possible to split the data along the time dimension
# and write it in smaller chunks.
#

######################################################################
#

# Write data in one go
dst1 = io.BytesIO()
s = StreamWriter(dst=dst1, format="mp3")
s.add_audio_stream(SAMPLE_RATE, NUM_CHANNELS)
with s.open():
    s.write_audio_chunk(0, WAVEFORM)

######################################################################
#

# Write data in smaller chunks
dst2 = io.BytesIO()
s = StreamWriter(dst=dst2, format="mp3")
s.add_audio_stream(SAMPLE_RATE, NUM_CHANNELS)
with s.open():
    for start in range(0, NUM_FRAMES, SAMPLE_RATE):
        end = start + SAMPLE_RATE
        s.write_audio_chunk(0, WAVEFORM[start:end, ...])

######################################################################
#

# Check that the contents are the same
dst1.seek(0)
bytes1 = dst1.read()
print(f"bytes1: {len(bytes1)}")
print(f"{bytes1[:10]}...{bytes1[-10:]}\n")

dst2.seek(0)
bytes2 = dst2.read()
print(f"bytes2: {len(bytes2)}")
print(f"{bytes2[:10]}...{bytes2[-10:]}\n")

assert bytes1 == bytes2

import matplotlib.pyplot as plt
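######################################################################
#
# In-memory results like the buffers above can be decoded again without
# touching the file system. The sketch below is an illustrative round trip,
# assuming :py:func:`torchaudio.load` can read from a file-like object when
# the container format is given explicitly; the signal is made up for this
# example.
#
# .. code::
#
#    import io
#
#    import torch
#    import torchaudio
#    from torchaudio.io import StreamWriter
#
#    sample_rate, num_channels = 8000, 2
#    waveform = torch.rand(sample_rate, num_channels) * 2 - 1  # (time, channel), float32
#
#    buffer = io.BytesIO()
#    w = StreamWriter(dst=buffer, format="wav")
#    w.add_audio_stream(sample_rate=sample_rate, num_channels=num_channels)
#    with w.open():
#        w.write_audio_chunk(0, waveform)
#
#    buffer.seek(0)
#    decoded, sr = torchaudio.load(buffer, format="wav")
#    print(decoded.shape, sr)  # (channel, time) by default
#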
######################################################################
#
# Example - Spectrum Visualizer
# -----------------------------
#
# In this section, we use StreamWriter to create a spectrum visualization
# of audio and save it as a video file.
#
# To create the spectrum visualization, we use
# :py:class:`torchaudio.transforms.Spectrogram` to get a spectral representation
# of the audio, generate raster images of its visualization using matplotlib,
# and then use StreamWriter to combine them with the original audio into a video.

import torchaudio.transforms as T

######################################################################
#
# Prepare Data
# ~~~~~~~~~~~~
#
# First, we prepare the spectrogram data.
# We use :py:class:`~torchaudio.transforms.Spectrogram`.
#
# We adjust ``hop_length`` so that one frame of the spectrogram corresponds
# to one video frame.
#

frame_rate = 20
n_fft = 4000

trans = T.Spectrogram(
    n_fft=n_fft,
    hop_length=SAMPLE_RATE // frame_rate,  # One FFT per one video frame
    normalized=True,
    power=1,
)
specs = trans(WAVEFORM.T)[0].T

######################################################################
#
# The resulting spectrogram looks like the following.
#

spec_db = T.AmplitudeToDB(stype="magnitude", top_db=80)(specs.T)
_ = plt.imshow(spec_db, aspect="auto", origin="lower")

######################################################################
#
# Prepare Canvas
# ~~~~~~~~~~~~~~
#
# We use ``matplotlib`` to visualize the spectrogram per frame.
# We create a helper function that plots the spectrogram data and
# generates a raster image of the figure.
#

fig, ax = plt.subplots(figsize=[3.2, 2.4])
ax.set_position([0, 0, 1, 1])
ax.set_facecolor("black")
ncols, nrows = fig.canvas.get_width_height()


def _plot(data):
    ax.clear()
    x = list(range(len(data)))
    R, G, B = 238 / 255, 76 / 255, 44 / 255
    for coeff, alpha in [(0.8, 0.7), (1, 1)]:
        d = data**coeff
        ax.fill_between(x, d, -d, color=[R, G, B, alpha])
    xlim = n_fft // 2 + 1
    ax.set_xlim([-1, xlim])
    ax.set_ylim([-1, 1])
    ax.text(
        xlim,
        0.95,
        f"Created with TorchAudio\n{torchaudio.__version__}",
        color="white",
        ha="right",
        va="top",
        backgroundcolor="black",
    )
    fig.canvas.draw()
    frame = torch.frombuffer(fig.canvas.tostring_rgb(), dtype=torch.uint8)
    return frame.reshape(nrows, ncols, 3).permute(2, 0, 1)


# sphinx_gallery_defer_figures

######################################################################
#
# Write Video
# ~~~~~~~~~~~
#
# Finally, we use StreamWriter to write the video.
# We process one second of audio and video frames at a time.
#

s = StreamWriter(get_path("example.mp4"))
s.add_audio_stream(sample_rate=SAMPLE_RATE, num_channels=NUM_CHANNELS)
s.add_video_stream(frame_rate=frame_rate, height=nrows, width=ncols)

with s.open():
    i = 0
    # Process by second
    for t in range(0, NUM_FRAMES, SAMPLE_RATE):
        # Write audio chunk
        s.write_audio_chunk(0, WAVEFORM[t : t + SAMPLE_RATE, :])

        # Write 1 second of video chunk
        frames = [_plot(spec) for spec in specs[i : i + frame_rate]]
        if frames:
            s.write_video_chunk(1, torch.stack(frames))
        i += frame_rate

plt.close(fig)

######################################################################
#
# Result
# ~~~~~~
#
# The result looks like the following.
#

Video(get_path("example.mp4"), embed=True)

######################################################################
#
# If you watch the video carefully, you can observe that the sound of "s"
# (curio\ **si**\ ty, be\ **si**\ des, thi\ **s**\ ) has more of its energy
# in the higher frequencies (the right side of the video).

######################################################################
#
# Tag: :obj:`torchaudio.io`
#