From 2fcd3ee3e5dbf65841d5e457aa1a558fee471433 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Wed, 1 Apr 2020 19:25:16 +0200 Subject: [PATCH] Add audio support --- README.md | 5 +- include/sound.hpp | 10 +- project.conf | 3 +- src/main.cpp | 260 +++++++++++++++++++++++++++++++++++++--------- src/sound.cpp | 19 ++-- 5 files changed, 233 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index 279b8de..974214d 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ This is a screen recorder that has minimal impact on system performance by recording a window using the GPU only, similar to shadowplay on windows. +The output is an h264 encoded video with aac audio. + This project is still early in development. # Performance @@ -12,11 +14,10 @@ the fps remains at 30. `gpu-screen-recorder 0x1c00001 mp4 60 > test_video.mp4` # Requirements -X11, Nvidia (cuda) +X11, Nvidia (cuda), pulseaudio # TODO * Scale video when the window is rescaled. -* Use the sound source in src/sound.cpp to record audio and mux it with ffmpeg to the final video. * Support AMD and Intel, using VAAPI. cuda and vaapi should be loaded at runtime using dlopen instead of linking to those libraries at compile-time. * Clean up the code! diff --git a/include/sound.hpp b/include/sound.hpp index 2b07254..c512c31 100644 --- a/include/sound.hpp +++ b/include/sound.hpp @@ -3,7 +3,7 @@ typedef struct { void *handle; - char *buffer; + void *buffer; int buffer_size; unsigned int frames; } SoundDevice; @@ -14,14 +14,16 @@ typedef struct { to clean up internal resources. Returns 0 on success, or a negative value on failure. 
*/ -int sound_device_get_by_name(SoundDevice *device, const char *name = "default", unsigned int num_channels = 1, unsigned int period_frame_size = 32); +int sound_device_get_by_name(SoundDevice *device, const char *name = "default", unsigned int num_channels = 2, unsigned int period_frame_size = 32); void sound_device_close(SoundDevice *device); /* Returns the next chunk of audio into @buffer. - Returns the size of the buffer, or a negative value on failure. + Returns the number of frames read, or a negative value on failure. */ -int sound_device_read_next_chunk(SoundDevice *device, char **buffer); +int sound_device_read_next_chunk(SoundDevice *device, void **buffer); + +int sound_device_get_buffer_size(SoundDevice *device); #endif /* GPU_SCREEN_RECORDER_H */ diff --git a/project.conf b/project.conf index 651def6..816b703 100644 --- a/project.conf +++ b/project.conf @@ -9,7 +9,6 @@ include_dirs = ["/opt/cuda/targets/x86_64-linux/include"] libs = ["/usr/lib/libcuda.so"] [dependencies] -ffnvcodec = ">=9" glew = ">=2" glx = ">=1" libavcodec = ">=58" @@ -23,3 +22,5 @@ xdamage = "1" glfw3 = "3" alsa = "1" + +libswresample = "3" \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index eb32c8a..27dd225 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,8 +3,13 @@ #include #include #include +#include +#include + #include +#include "../include/sound.hpp" + #define GLX_GLXEXT_PROTOTYPES #include #include @@ -19,6 +24,8 @@ extern "C" { #include #include #include +#include +#include } #include @@ -28,6 +35,14 @@ extern "C" { //#include +static char av_error_buffer[AV_ERROR_MAX_STRING_SIZE]; + +static char* av_error_to_string(int err) { + if(av_strerror(err, av_error_buffer, sizeof(av_error_buffer)) < 0) /* negative return: no description was found for err */ + strcpy(av_error_buffer, "Unknown error"); + return av_error_buffer; +} + struct ScopedGLXFBConfig { ~ScopedGLXFBConfig() { if (configs) @@ -236,7 +251,8 @@ std::vector get_hardware_acceleration_device_names() { } static void receive_frames(AVCodecContext 
*av_codec_context, AVStream *stream, - AVFormatContext *av_format_context) { + AVFormatContext *av_format_context, + std::mutex &write_output_mutex) { AVPacket av_packet; av_init_packet(&av_packet); for (;;) { @@ -244,14 +260,17 @@ static void receive_frames(AVCodecContext *av_codec_context, AVStream *stream, av_packet.size = 0; int res = avcodec_receive_packet(av_codec_context, &av_packet); if (res == 0) { // we have a packet, send the packet to the muxer + assert(av_packet.stream_index == stream->id); av_packet_rescale_ts(&av_packet, av_codec_context->time_base, stream->time_base); av_packet.stream_index = stream->index; // Write the encoded video frame to disk // av_write_frame(av_format_context, &av_packet) // write(STDOUT_FILENO, av_packet.data, av_packet.size) - if (av_write_frame(av_format_context, &av_packet) < 0) { - fprintf(stderr, "Error: Failed to write frame to muxer\n"); + std::lock_guard lock(write_output_mutex); + int ret = av_write_frame(av_format_context, &av_packet); + if(ret < 0) { + fprintf(stderr, "Error: Failed to write video frame to muxer, reason: %s (%d)\n", av_error_to_string(ret), ret); } av_packet_unref(&av_packet); } else if (res == AVERROR(EAGAIN)) { // we have no packet @@ -268,7 +287,46 @@ static void receive_frames(AVCodecContext *av_codec_context, AVStream *stream, //av_packet_unref(&av_packet); } -static AVStream *add_stream(AVFormatContext *av_format_context, AVCodec **codec, +static AVStream *add_audio_stream(AVFormatContext *av_format_context, AVCodec **codec, + enum AVCodecID codec_id) { + *codec = avcodec_find_encoder(AV_CODEC_ID_AAC); + if (!*codec) { + fprintf( + stderr, + "Error: Could not find aac encoder\n"); + exit(1); + } + + AVStream *stream = avformat_new_stream(av_format_context, *codec); + if (!stream) { + fprintf(stderr, "Error: Could not allocate stream\n"); + exit(1); + } + stream->id = av_format_context->nb_streams - 1; + fprintf(stderr, "audio stream id: %d\n", stream->id); + AVCodecContext *codec_context = 
stream->codec; + + assert((*codec)->type == AVMEDIA_TYPE_AUDIO); + /* + codec_context->sample_fmt = (*codec)->sample_fmts + ? (*codec)->sample_fmts[0] + : AV_SAMPLE_FMT_FLTP; + */ + codec_context->codec_id = AV_CODEC_ID_AAC; + codec_context->sample_fmt = AV_SAMPLE_FMT_FLTP; + //codec_context->bit_rate = 64000; + codec_context->sample_rate = 48000; + codec_context->channel_layout = AV_CH_LAYOUT_STEREO; + codec_context->channels = 2; + + // Some formats want stream headers to be seperate + //if (av_format_context->oformat->flags & AVFMT_GLOBALHEADER) + // av_format_context->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; + + return stream; +} + +static AVStream *add_video_stream(AVFormatContext *av_format_context, AVCodec **codec, enum AVCodecID codec_id, const WindowPixmap &window_pixmap, int fps) { @@ -280,8 +338,7 @@ static AVStream *add_stream(AVFormatContext *av_format_context, AVCodec **codec, if (!*codec) { fprintf( stderr, - "Error: Could not find h264_nvenc or nvenc_h264 encoder for %s\n", - avcodec_get_name(codec_id)); + "Error: Could not find h264_nvenc or nvenc_h264 encoder\n"); exit(1); } @@ -291,48 +348,33 @@ static AVStream *add_stream(AVFormatContext *av_format_context, AVCodec **codec, exit(1); } stream->id = av_format_context->nb_streams - 1; + fprintf(stderr, "video stream id: %d\n", stream->id); AVCodecContext *codec_context = stream->codec; - switch ((*codec)->type) { - case AVMEDIA_TYPE_AUDIO: { - codec_context->sample_fmt = (*codec)->sample_fmts - ? (*codec)->sample_fmts[0] - : AV_SAMPLE_FMT_FLTP; - codec_context->bit_rate = 64000; - codec_context->sample_rate = 44100; - codec_context->channels = 2; - break; - } - case AVMEDIA_TYPE_VIDEO: { - codec_context->codec_id = codec_id; - // TODO: Scale bitrate by resolution. 
For 4k, 8000000 is a better value - codec_context->bit_rate = 5000000; - // Resolution must be a multiple of two - codec_context->width = window_pixmap.texture_width & ~1; - codec_context->height = window_pixmap.texture_height & ~1; - // Timebase: This is the fundamental unit of time (in seconds) in terms - // of which frame timestamps are represented. For fixed-fps content, - // timebase should be 1/framerate and timestamp increments should be - // identical to 1 - codec_context->time_base.num = 1; - codec_context->time_base.den = fps; - // codec_context->framerate.num = 60; - // codec_context->framerate.den = 1; - codec_context->sample_aspect_ratio.num = 1; - codec_context->sample_aspect_ratio.den = 1; - codec_context->gop_size = - 32; // Emit one intra frame every 32 frames at most - codec_context->pix_fmt = AV_PIX_FMT_CUDA; - if (codec_context->codec_id == AV_CODEC_ID_MPEG1VIDEO) - codec_context->mb_decision = 2; + assert((*codec)->type == AVMEDIA_TYPE_VIDEO); + codec_context->codec_id = (*codec)->id; + fprintf(stderr, "codec id: %d\n", (*codec)->id); + codec_context->width = window_pixmap.texture_width & ~1; + codec_context->height = window_pixmap.texture_height & ~1; + codec_context->bit_rate = codec_context->width * codec_context->height; //5000000; + // Timebase: This is the fundamental unit of time (in seconds) in terms + // of which frame timestamps are represented. 
For fixed-fps content, + // timebase should be 1/framerate and timestamp increments should be + // identical to 1 + codec_context->time_base.num = 1; + codec_context->time_base.den = fps; + // codec_context->framerate.num = 60; + // codec_context->framerate.den = 1; + codec_context->sample_aspect_ratio.num = 1; + codec_context->sample_aspect_ratio.den = 1; + codec_context->gop_size = + 32; // Emit one intra frame every 32 frames at most + codec_context->pix_fmt = AV_PIX_FMT_CUDA; + if (codec_context->codec_id == AV_CODEC_ID_MPEG1VIDEO) + codec_context->mb_decision = 2; - // stream->time_base = codec_context->time_base; - // codec_context->ticks_per_frame = 30; - break; - } - default: - break; - } + // stream->time_base = codec_context->time_base; + // codec_context->ticks_per_frame = 30; // Some formats want stream headers to be seperate if (av_format_context->oformat->flags & AVFMT_GLOBALHEADER) @@ -341,6 +383,36 @@ static AVStream *add_stream(AVFormatContext *av_format_context, AVCodec **codec, return stream; } +static AVFrame* open_audio(AVCodec *codec, AVStream *stream) { + int ret; + AVCodecContext *codec_context = stream->codec; + + ret = avcodec_open2(codec_context, codec, nullptr); + if(ret < 0) { + fprintf(stderr, "failed to open codec, reason: %s\n", av_error_to_string(ret)); + exit(1); + } + + AVFrame *frame = av_frame_alloc(); + if(!frame) { + fprintf(stderr, "failed to allocate audio frame\n"); + exit(1); + } + + frame->nb_samples = codec_context->frame_size; + frame->format = codec_context->sample_fmt; + frame->channels = codec_context->channels; + frame->channel_layout = codec_context->channel_layout; + + ret = av_frame_get_buffer(frame, 0); + if(ret < 0) { + fprintf(stderr, "failed to allocate audio data buffers, reason: %s\n", av_error_to_string(ret)); + exit(1); + } + + return frame; +} + static void open_video(AVCodec *codec, AVStream *stream, WindowPixmap &window_pixmap, AVBufferRef **device_ctx, CUgraphicsResource *cuda_graphics_resource) { @@ 
-528,15 +600,24 @@ int main(int argc, char **argv) { } AVOutputFormat *output_format = av_format_context->oformat; + AVCodec *video_codec; AVStream *video_stream = - add_stream(av_format_context, &video_codec, output_format->video_codec, + add_video_stream(av_format_context, &video_codec, output_format->video_codec, window_pixmap, fps); if (!video_stream) { fprintf(stderr, "Error: Failed to create video stream\n"); return 1; } + AVCodec *audio_codec; + AVStream *audio_stream = + add_audio_stream(av_format_context, &audio_codec, output_format->audio_codec); + if (!audio_stream) { + fprintf(stderr, "Error: Failed to create audio stream\n"); + return 1; + } + if (cuInit(0) < 0) { fprintf(stderr, "Error: cuInit failed\n"); return {}; @@ -547,7 +628,9 @@ int main(int argc, char **argv) { open_video(video_codec, video_stream, window_pixmap, &device_ctx, &cuda_graphics_resource); - av_dump_format(av_format_context, 0, filename, 1); + AVFrame *audio_frame = open_audio(audio_codec, audio_stream); + + //av_dump_format(av_format_context, 0, filename, 1); if (!(output_format->flags & AVFMT_NOFILE)) { int ret = avio_open(&av_format_context->pb, filename, AVIO_FLAG_WRITE); @@ -635,6 +718,69 @@ int main(int argc, char **argv) { int window_width = xwa.width; int window_height = xwa.height; + SoundDevice sound_device; + if(sound_device_get_by_name(&sound_device, "pulse", audio_stream->codec->channels, audio_stream->codec->frame_size) != 0) { + fprintf(stderr, "failed to get 'pulse' sound device\n"); + exit(1); + } + + int audio_buffer_size = av_samples_get_buffer_size(NULL, audio_stream->codec->channels, audio_stream->codec->frame_size, audio_stream->codec->sample_fmt, 1); + uint8_t *audio_frame_buf = (uint8_t *)av_malloc(audio_buffer_size); + avcodec_fill_audio_frame(audio_frame, audio_stream->codec->channels, audio_stream->codec->sample_fmt, (const uint8_t*)audio_frame_buf, audio_buffer_size, 1); + + AVPacket audio_packet; + av_new_packet(&audio_packet, audio_buffer_size); + + 
std::mutex write_output_mutex; + + bool running = true; + std::thread audio_thread([&running](AVFormatContext *av_format_context, AVStream *audio_stream, AVPacket *audio_packet, uint8_t *audio_frame_buf, SoundDevice *sound_device, AVFrame *audio_frame, std::mutex *write_output_mutex) { + SwrContext *swr = swr_alloc(); + if(!swr) { + fprintf(stderr, "Failed to create SwrContext\n"); + exit(1); + } + av_opt_set_int(swr, "in_channel_layout", audio_stream->codec->channel_layout, 0); + av_opt_set_int(swr, "out_channel_layout", audio_stream->codec->channel_layout, 0); + av_opt_set_int(swr, "in_sample_rate", audio_stream->codec->sample_rate, 0); + av_opt_set_int(swr, "out_sample_rate", audio_stream->codec->sample_rate, 0); + av_opt_set_sample_fmt(swr, "in_sample_fmt", AV_SAMPLE_FMT_S16, 0); + av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_FLTP, 0); + swr_init(swr); + + while(running) { + void *sound_buffer; + int sound_buffer_size = sound_device_read_next_chunk(sound_device, &sound_buffer); + if(sound_buffer_size >= 0) { + // TODO: Instead of converting audio, get float audio from alsa. Or does alsa do conversion internally to get this format? + swr_convert(swr, &audio_frame_buf, audio_frame->nb_samples, (const uint8_t**)&sound_buffer, sound_buffer_size); + audio_frame->extended_data = &audio_frame_buf; + // TODO: Fix this. Warning from ffmpeg: + // Timestamps are unset in a packet for stream 1. This is deprecated and will stop working in the future. Fix your code to set the timestamps properly + //audio_frame->pts=audio_frame_index*100; + //++audio_frame_index; + + int got_frame = 0; + int ret = avcodec_encode_audio2(audio_stream->codec, audio_packet, audio_frame, &got_frame); + if(ret < 0){ + printf("Failed to encode!\n"); + break; + } + if (got_frame==1){ + //printf("Succeed to encode 1 frame! 
\tsize:%5d\n",pkt.size); + audio_packet->stream_index = audio_stream->index; + std::lock_guard lock(*write_output_mutex); + ret = av_write_frame(av_format_context, audio_packet); + av_free_packet(audio_packet); + } + } else { + fprintf(stderr, "failed to read sound from device, error: %d\n", sound_buffer_size); + } + } + + swr_free(&swr); + }, av_format_context, audio_stream, &audio_packet, audio_frame_buf, &sound_device, audio_frame, &write_output_mutex); + XEvent e; while (!glfwWindowShouldClose(window)) { glClear(GL_COLOR_BUFFER_BIT); @@ -719,7 +865,7 @@ int main(int argc, char **argv) { "Error: cuGraphicsGLRegisterImage failed, error %s, texture " "id: %u\n", err_str, window_pixmap.target_texture_id); - exit(1); + break; } res = cuGraphicsResourceSetMapFlags( @@ -730,7 +876,7 @@ int main(int argc, char **argv) { av_frame_unref(frame); if (av_hwframe_get_buffer(video_stream->codec->hw_frames_ctx, frame, 0) < 0) { fprintf(stderr, "Error: av_hwframe_get_buffer failed\n"); - exit(1); + break; } } @@ -741,7 +887,7 @@ int main(int argc, char **argv) { frame_count += 1; if (avcodec_send_frame(video_stream->codec, frame) >= 0) { receive_frames(video_stream->codec, video_stream, - av_format_context); + av_format_context, write_output_mutex); } else { fprintf(stderr, "Error: avcodec_send_frame failed\n"); } @@ -752,6 +898,20 @@ int main(int argc, char **argv) { usleep(5000); } + running = false; + audio_thread.join(); + + sound_device_close(&sound_device); + + //Flush Encoder + #if 0 + ret = flush_encoder(pFormatCtx,0); + if (ret < 0) { + printf("Flushing encoder failed\n"); + return -1; + } + #endif + if (av_write_trailer(av_format_context) != 0) { fprintf(stderr, "Failed to write trailer\n"); } diff --git a/src/sound.cpp b/src/sound.cpp index 6d23260..c9ee5e7 100644 --- a/src/sound.cpp +++ b/src/sound.cpp @@ -26,8 +26,8 @@ int sound_device_get_by_name(SoundDevice *device, const char *name, unsigned int snd_pcm_hw_params_set_format(handle, params, 
SND_PCM_FORMAT_S16_LE); snd_pcm_hw_params_set_channels(handle, params, num_channels); - // 44100 bits/second samling rate (CD quality) - unsigned int val = 44100; + // 48000 samples/second sampling rate (DVD quality) + unsigned int val = 48000; int dir; snd_pcm_hw_params_set_rate_near(handle, params, &val, &dir); @@ -45,7 +45,7 @@ int sound_device_get_by_name(SoundDevice *device, const char *name, unsigned int // Use a buffer large enough to hold one period snd_pcm_hw_params_get_period_size(params, &frames, &dir); int buffer_size = frames * 2 * num_channels; // 2 bytes/sample, @num_channels channels - char *buffer = (char*)malloc(buffer_size); + void *buffer = malloc(buffer_size); if(!buffer) { fprintf(stderr, "failed to allocate buffer for audio\n"); snd_pcm_close(handle); @@ -61,18 +61,19 @@ int sound_device_get_by_name(SoundDevice *device, const char *name, unsigned int void sound_device_close(SoundDevice *device) { /* TODO: Is this also needed in @sound_device_get_by_name on failure? */ - snd_pcm_drain((snd_pcm_t*)device->handle); + // TODO: This has been commented out since it causes the thread to block forever. Why? 
+ //snd_pcm_drain((snd_pcm_t*)device->handle); snd_pcm_close((snd_pcm_t*)device->handle); free(device->buffer); } -int sound_device_read_next_chunk(SoundDevice *device, char **buffer) { +int sound_device_read_next_chunk(SoundDevice *device, void **buffer) { int rc = snd_pcm_readi((snd_pcm_t*)device->handle, device->buffer, device->frames); if (rc == -EPIPE) { /* overrun */ fprintf(stderr, "overrun occured\n"); snd_pcm_prepare((snd_pcm_t*)device->handle); - return 0; + return rc; } else if(rc < 0) { fprintf(stderr, "failed to read from sound device, reason: %s\n", snd_strerror(rc)); return rc; @@ -80,5 +81,9 @@ int sound_device_read_next_chunk(SoundDevice *device, char **buffer) { fprintf(stderr, "short read, read %d frames\n", rc); } *buffer = device->buffer; - return 0; + return rc; +} + +int sound_device_get_buffer_size(SoundDevice *device) { + return device->buffer_size; }