diff --git a/src/main.cpp b/src/main.cpp
index e7aa115..71452e6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -549,7 +549,7 @@ static const AVCodec* find_av1_encoder(gsr_gpu_vendor vendor, const char *card_p
     return checked_success ? codec : nullptr;
 }
 
-static AVFrame* open_audio(AVCodecContext *audio_codec_context) {
+static void open_audio(AVCodecContext *audio_codec_context) {
     AVDictionary *options = nullptr;
     av_dict_set(&options, "strict", "experimental", 0);
 
@@ -559,7 +559,9 @@ static AVFrame* open_audio(AVCodecContext *audio_codec_context) {
         fprintf(stderr, "failed to open codec, reason: %s\n", av_error_to_string(ret));
         _exit(1);
     }
+}
 
+static AVFrame* create_audio_frame(AVCodecContext *audio_codec_context) {
     AVFrame *frame = av_frame_alloc();
     if(!frame) {
         fprintf(stderr, "failed to allocate audio frame\n");
@@ -576,7 +578,7 @@ static AVFrame* open_audio(AVCodecContext *audio_codec_context) {
     av_channel_layout_copy(&frame->ch_layout, &audio_codec_context->ch_layout);
 #endif
 
-    ret = av_frame_get_buffer(frame, 0);
+    int ret = av_frame_get_buffer(frame, 0);
     if(ret < 0) {
         fprintf(stderr, "failed to allocate audio data buffers, reason: %s\n", av_error_to_string(ret));
         _exit(1);
@@ -924,13 +926,13 @@ struct AudioDevice {
     SoundDevice sound_device;
     AudioInput audio_input;
     AVFilterContext *src_filter_ctx = nullptr;
+    AVFrame *frame = nullptr;
     std::thread thread; // TODO: Instead of having a thread for each track, have one thread for all threads and read the data with non-blocking read
 };
 
 // TODO: Cleanup
 struct AudioTrack {
     AVCodecContext *codec_context = nullptr;
-    AVFrame *frame = nullptr;
     AVStream *stream = nullptr;
 
     std::vector<AudioDevice> audio_devices;
@@ -1980,7 +1982,7 @@ int main(int argc, char **argv) {
         if(replay_buffer_size_secs == -1)
             audio_stream = create_stream(av_format_context, audio_codec_context);
 
-        AVFrame *audio_frame = open_audio(audio_codec_context);
+        open_audio(audio_codec_context);
         if(audio_stream)
             avcodec_parameters_from_context(audio_stream->codecpar, audio_codec_context);
 
@@ -2020,11 +2022,13 @@ int main(int argc, char **argv) {
             if(audio_input.name.empty()) {
                 audio_device.sound_device.handle = NULL;
                 audio_device.sound_device.frames = 0;
+                audio_device.frame = NULL;
             } else {
                 if(sound_device_get_by_name(&audio_device.sound_device, audio_input.name.c_str(), audio_input.description.c_str(), num_channels, audio_codec_context->frame_size, audio_codec_context_get_audio_format(audio_codec_context)) != 0) {
                     fprintf(stderr, "Error: failed to get \"%s\" sound device\n", audio_input.name.c_str());
                     _exit(1);
                 }
+                audio_device.frame = create_audio_frame(audio_codec_context);
             }
 
             audio_devices.push_back(std::move(audio_device));
@@ -2032,7 +2036,6 @@ int main(int argc, char **argv) {
 
         AudioTrack audio_track;
         audio_track.codec_context = audio_codec_context;
-        audio_track.frame = audio_frame;
         audio_track.stream = audio_stream;
         audio_track.audio_devices = std::move(audio_devices);
         audio_track.graph = graph;
@@ -2138,14 +2141,14 @@ int main(int argc, char **argv) {
                     if(got_audio_data)
                         received_audio_time = this_audio_frame_time;
 
-                    int ret = av_frame_make_writable(audio_track.frame);
+                    int ret = av_frame_make_writable(audio_device.frame);
                     if (ret < 0) {
                         fprintf(stderr, "Failed to make audio frame writable\n");
                         break;
                     }
 
                     // TODO: Is this |received_audio_time| really correct?
-                    int64_t num_missing_frames = std::round((this_audio_frame_time - received_audio_time) / target_audio_hz / (int64_t)audio_track.frame->nb_samples);
+                    int64_t num_missing_frames = std::round((this_audio_frame_time - received_audio_time) / target_audio_hz / (int64_t)audio_device.frame->nb_samples);
                     if(got_audio_data)
                         num_missing_frames = std::max((int64_t)0, num_missing_frames - 1);
 
@@ -2164,29 +2167,29 @@ int main(int argc, char **argv) {
                         //audio_track.frame->data[0] = empty_audio;
                         received_audio_time = this_audio_frame_time;
                         if(needs_audio_conversion)
-                            swr_convert(swr, &audio_track.frame->data[0], audio_track.frame->nb_samples, (const uint8_t**)&empty_audio, audio_track.codec_context->frame_size);
+                            swr_convert(swr, &audio_device.frame->data[0], audio_device.frame->nb_samples, (const uint8_t**)&empty_audio, audio_track.codec_context->frame_size);
                         else
-                            audio_track.frame->data[0] = empty_audio;
+                            audio_device.frame->data[0] = empty_audio;
 
                         // TODO: Check if duplicate frame can be saved just by writing it with a different pts instead of sending it again
                         std::lock_guard<std::mutex> lock(audio_filter_mutex);
                         for(int i = 0; i < num_missing_frames; ++i) {
                             if(audio_track.graph) {
                                 // TODO: av_buffersrc_add_frame
-                                if(av_buffersrc_write_frame(audio_device.src_filter_ctx, audio_track.frame) < 0) {
+                                if(av_buffersrc_write_frame(audio_device.src_filter_ctx, audio_device.frame) < 0) {
                                     fprintf(stderr, "Error: failed to add audio frame to filter\n");
                                 }
                             } else {
-                                audio_track.frame->pts = (this_audio_frame_time - record_start_time) * (double)AV_TIME_BASE;
-                                const bool same_pts = audio_track.frame->pts == prev_pts;
-                                prev_pts = audio_track.frame->pts;
+                                audio_device.frame->pts = (this_audio_frame_time - record_start_time) * (double)AV_TIME_BASE;
+                                const bool same_pts = audio_device.frame->pts == prev_pts;
+                                prev_pts = audio_device.frame->pts;
                                 if(same_pts)
                                     continue;
 
-                                ret = avcodec_send_frame(audio_track.codec_context, audio_track.frame);
+                                ret = avcodec_send_frame(audio_track.codec_context, audio_device.frame);
                                 if(ret >= 0) {
                                     // TODO: Move to separate thread because this could write to network (for example when livestreaming)
-                                    receive_frames(audio_track.codec_context, audio_track.stream_index, audio_track.stream, audio_track.frame->pts, av_format_context, record_start_time, frame_data_queue, replay_buffer_size_secs, frames_erased, write_output_mutex);
+                                    receive_frames(audio_track.codec_context, audio_track.stream_index, audio_track.stream, audio_device.frame->pts, av_format_context, record_start_time, frame_data_queue, replay_buffer_size_secs, frames_erased, write_output_mutex);
                                 } else {
                                     fprintf(stderr, "Failed to encode audio!\n");
                                 }
@@ -2200,27 +2203,27 @@ int main(int argc, char **argv) {
                     if(got_audio_data) {
                         // TODO: Instead of converting audio, get float audio from alsa. Or does alsa do conversion internally to get this format?
                         if(needs_audio_conversion)
-                            swr_convert(swr, &audio_track.frame->data[0], audio_track.frame->nb_samples, (const uint8_t**)&sound_buffer, audio_track.codec_context->frame_size);
+                            swr_convert(swr, &audio_device.frame->data[0], audio_device.frame->nb_samples, (const uint8_t**)&sound_buffer, audio_track.codec_context->frame_size);
                         else
-                            audio_track.frame->data[0] = (uint8_t*)sound_buffer;
+                            audio_device.frame->data[0] = (uint8_t*)sound_buffer;
 
-                        audio_track.frame->pts = (this_audio_frame_time - record_start_time) * (double)AV_TIME_BASE;
-                        const bool same_pts = audio_track.frame->pts == prev_pts;
-                        prev_pts = audio_track.frame->pts;
+                        audio_device.frame->pts = (this_audio_frame_time - record_start_time) * (double)AV_TIME_BASE;
+                        const bool same_pts = audio_device.frame->pts == prev_pts;
+                        prev_pts = audio_device.frame->pts;
                         if(same_pts)
                             continue;
 
                         if(audio_track.graph) {
                             std::lock_guard<std::mutex> lock(audio_filter_mutex);
                             // TODO: av_buffersrc_add_frame
-                            if(av_buffersrc_write_frame(audio_device.src_filter_ctx, audio_track.frame) < 0) {
+                            if(av_buffersrc_write_frame(audio_device.src_filter_ctx, audio_device.frame) < 0) {
                                 fprintf(stderr, "Error: failed to add audio frame to filter\n");
                             }
                         } else {
-                            ret = avcodec_send_frame(audio_track.codec_context, audio_track.frame);
+                            ret = avcodec_send_frame(audio_track.codec_context, audio_device.frame);
                             if(ret >= 0) {
                                 // TODO: Move to separate thread because this could write to network (for example when livestreaming)
-                                receive_frames(audio_track.codec_context, audio_track.stream_index, audio_track.stream, audio_track.frame->pts, av_format_context, record_start_time, frame_data_queue, replay_buffer_size_secs, frames_erased, write_output_mutex);
+                                receive_frames(audio_track.codec_context, audio_track.stream_index, audio_track.stream, audio_device.frame->pts, av_format_context, record_start_time, frame_data_queue, replay_buffer_size_secs, frames_erased, write_output_mutex);
                             } else {
                                 fprintf(stderr, "Failed to encode audio!\n");
                             }