Remove unnecessary cuda memcpy when using nvfbc

2022-04-04 06:13:52 +02:00
parent c43fa5e4ee
commit 6a01677e23
3 changed files with 41 additions and 69 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,5 @@ tests/compile_commands.json
 .clangd/
 .cache/

-main.o
-sound.o
+*.o
 gpu-screen-recorder
--- a/6
+++ b/6
@@ -1,11 +1,7 @@
 Check for reparent.
 Only add window to list if its the window is a topmost window.
-Use nvEncoder api directly? maybe with this we could copy the window opengl texture directly to the gpu which doesn't work right now for some reason.
-    Right now we are required to copy the opengl texture to another opengl texture first.
-    nvEncRegisterResource allows registering an opengl texture directly with NV_ENC_INPUT_RESOURCE_OPENGL_TEX and using that directly in the encoding.
 Load cuda at runtime with dlopen.
 Track window damages and only update then. That is better for output file size.
-Remove cuda to cuda copy when using nvFBC if possible. ffmpeg is getting in the way.
 Getting the texture of a window when using a compositor is an nvidia specific limitation. When gpu-screen-recorder supports other gpus then this can be ignored.
 Remove dependency on glfw (and glew?).
-Quickly changing workspace and back while recording under i3 breaks the screen recorder. The resize is triggered and it fails to recreate texture (fail to get texture size, texture id probably == 0).
+Quickly changing workspace and back while recording under i3 breaks the screen recorder. i3 probably unmaps windows in other workspaces.
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -310,23 +310,6 @@ static bool recreate_window_pixmap(Display *dpy, Window window_id,
    return pixmap.texture_id != 0 && pixmap.target_texture_id != 0;
 }

-std::vector<std::string> get_hardware_acceleration_device_names() {
-    int iGpu = 0;
-    int nGpu = 0;
-    cuDeviceGetCount(&nGpu);
-    if (iGpu < 0 || iGpu >= nGpu) {
-        fprintf(stderr, "Error: failed...\n");
-        return {};
-    }
-
-    CUdevice cuDevice = 0;
-    cuDeviceGet(&cuDevice, iGpu);
-    char deviceName[80];
-    cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice);
-    fprintf(stderr, "device name: %s\n", deviceName);
-    return {deviceName};
-}
-
 // |stream| is only required for non-replay mode
 static void receive_frames(AVCodecContext *av_codec_context, int stream_index, AVStream *stream, AVFrame *frame,
                           AVFormatContext *av_format_context,
@@ -438,7 +421,6 @@ static AVCodecContext *create_video_codec_context(AVFormatContext *av_format_con

    assert(codec->type == AVMEDIA_TYPE_VIDEO);
    codec_context->codec_id = codec->id;
-    fprintf(stderr, "codec id: %d\n", codec->id);
    codec_context->width = record_width & ~1;
    codec_context->height = record_height & ~1;
 	codec_context->bit_rate = 7500000 + (codec_context->width * codec_context->height) / 2;
@@ -464,6 +446,7 @@ static AVCodecContext *create_video_codec_context(AVFormatContext *av_format_con
            //av_opt_set(codec_context->priv_data, "preset", "slow", 0);
            //av_opt_set(codec_context->priv_data, "profile", "high", 0);
            //codec_context->profile = FF_PROFILE_H264_HIGH;
+            av_opt_set(codec_context->priv_data, "preset", "p4", 0);
            break;
        case VideoQuality::HIGH:
            codec_context->qmin = 12;
@@ -471,6 +454,7 @@ static AVCodecContext *create_video_codec_context(AVFormatContext *av_format_con
            //av_opt_set(codec_context->priv_data, "preset", "slow", 0);
            //av_opt_set(codec_context->priv_data, "profile", "high", 0);
            //codec_context->profile = FF_PROFILE_H264_HIGH;
+            av_opt_set(codec_context->priv_data, "preset", "p6", 0);
            break;
        case VideoQuality::ULTRA:
 	        codec_context->bit_rate = 10000000 + (codec_context->width * codec_context->height) / 2;
@@ -479,6 +463,7 @@ static AVCodecContext *create_video_codec_context(AVFormatContext *av_format_con
            //av_opt_set(codec_context->priv_data, "preset", "veryslow", 0);
            //av_opt_set(codec_context->priv_data, "profile", "high", 0);
            //codec_context->profile = FF_PROFILE_H264_HIGH;
+            av_opt_set(codec_context->priv_data, "preset", "p7", 0);
            break;
    }
    if (codec_context->codec_id == AV_CODEC_ID_MPEG1VIDEO)
@@ -486,6 +471,7 @@ static AVCodecContext *create_video_codec_context(AVFormatContext *av_format_con

    // stream->time_base = codec_context->time_base;
    // codec_context->ticks_per_frame = 30;
+    av_opt_set(codec_context->priv_data, "tune", "hq", 0);

    // Some formats want stream headers to be seperate
    if (av_format_context->oformat->flags & AVFMT_GLOBALHEADER)
@@ -524,24 +510,20 @@ static AVFrame* open_audio(AVCodecContext *audio_codec_context) {

 static void open_video(AVCodecContext *codec_context,
                       WindowPixmap &window_pixmap, AVBufferRef **device_ctx,
-                       CUgraphicsResource *cuda_graphics_resource) {
+                       CUgraphicsResource *cuda_graphics_resource, CUcontext cuda_context) {
    int ret;

-    std::vector<std::string> hardware_accelerated_devices =
-        get_hardware_acceleration_device_names();
-    if (hardware_accelerated_devices.empty()) {
-        fprintf(
-            stderr,
-            "Error: No hardware accelerated device was found on your system\n");
+    *device_ctx = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_CUDA);
+    if(!*device_ctx) {
+        fprintf(stderr, "Error: Failed to create hardware device context\n");
        exit(1);
    }

-    if (av_hwdevice_ctx_create(device_ctx, AV_HWDEVICE_TYPE_CUDA,
-                               hardware_accelerated_devices[0].c_str(), NULL,
-                               0) < 0) {
-        fprintf(stderr,
-                "Error: Failed to create hardware device context for gpu: %s\n",
-                hardware_accelerated_devices[0].c_str());
+    AVHWDeviceContext *hw_device_context = (AVHWDeviceContext *)(*device_ctx)->data;
+    AVCUDADeviceContext *cuda_device_context = (AVCUDADeviceContext *)hw_device_context->hwctx;
+    cuda_device_context->cuda_ctx = cuda_context;
+    if(av_hwdevice_ctx_init(*device_ctx) < 0) {
+        fprintf(stderr, "Error: Failed to create hardware device context\n");
        exit(1);
    }

@@ -576,21 +558,11 @@ static void open_video(AVCodecContext *codec_context,
        exit(1);
    }

-    AVHWDeviceContext *hw_device_context =
-        (AVHWDeviceContext *)(*device_ctx)->data;
-    AVCUDADeviceContext *cuda_device_context =
-        (AVCUDADeviceContext *)hw_device_context->hwctx;
-    CUcontext *cuda_context = &(cuda_device_context->cuda_ctx);
-    if (!cuda_context) {
-        fprintf(stderr, "Error: No cuda context\n");
-        exit(1);
-    }
-
    if(window_pixmap.target_texture_id != 0) {
        CUresult res;
        CUcontext old_ctx;
        res = cuCtxPopCurrent(&old_ctx);
-        res = cuCtxPushCurrent(*cuda_context);
+        res = cuCtxPushCurrent(cuda_context);
        res = cuGraphicsGLRegisterImage(
            cuda_graphics_resource, window_pixmap.target_texture_id, GL_TEXTURE_2D,
            CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY);
@@ -930,21 +902,34 @@ int main(int argc, char **argv) {

    res = cuInit(0);
    if(res != CUDA_SUCCESS) {
-        fprintf(stderr, "Error: cuInit failed (result: %d)\n", res);
-        return {};
+        const char *err_str;
+        cuGetErrorString(res, &err_str);
+        fprintf(stderr, "Error: cuInit failed, error %s (result: %d)\n", err_str, res);
+        return 1;
+    }
+
+    int nGpu = 0;
+    cuDeviceGetCount(&nGpu);
+    if (nGpu <= 0) {
+        fprintf(stderr, "Error: no cuda supported devices found\n");
+        return 1;
    }

    CUdevice cu_dev;
    res = cuDeviceGet(&cu_dev, 0);
    if(res != CUDA_SUCCESS) {
-        fprintf(stderr, "Unable to get CUDA device (result: %d)\n", res);
+        const char *err_str;
+        cuGetErrorString(res, &err_str);
+        fprintf(stderr, "Error: unable to get CUDA device, error: %s (result: %d)\n", err_str, res);
        return 1;
    }

    CUcontext cu_ctx;
    res = cuCtxCreate_v2(&cu_ctx, CU_CTX_SCHED_AUTO, cu_dev);
    if(res != CUDA_SUCCESS) {
-        fprintf(stderr, "Unable to create CUDA context (result: %d)\n", res);
+        const char *err_str;
+        cuGetErrorString(res, &err_str);
+        fprintf(stderr, "Error: unable to create CUDA context, error: %s (result: %d)\n", err_str, res);
        return 1;
    }

@@ -1124,7 +1109,7 @@ int main(int argc, char **argv) {

    AVBufferRef *device_ctx;
    CUgraphicsResource cuda_graphics_resource;
-    open_video(video_codec_context, window_pixmap, &device_ctx, &cuda_graphics_resource);
+    open_video(video_codec_context, window_pixmap, &device_ctx, &cuda_graphics_resource, cu_ctx);
    if(video_stream)
        avcodec_parameters_from_context(video_stream->codecpar, video_codec_context);

@@ -1161,16 +1146,6 @@ int main(int argc, char **argv) {
        }
    }

-    AVHWDeviceContext *hw_device_context =
-        (AVHWDeviceContext *)device_ctx->data;
-    AVCUDADeviceContext *cuda_device_context =
-        (AVCUDADeviceContext *)hw_device_context->hwctx;
-    CUcontext *cuda_context = &(cuda_device_context->cuda_ctx);
-    if (!cuda_context) {
-        fprintf(stderr, "Error: No cuda context\n");
-        exit(1);
-    }
-
    // av_frame_free(&rgb_frame);
    // avcodec_close(av_codec_context);

@@ -1195,7 +1170,7 @@ int main(int argc, char **argv) {
    CUarray mapped_array;
    if(src_window_id) {
        res = cuCtxPopCurrent(&old_ctx);
-        res = cuCtxPushCurrent(*cuda_context);
+        res = cuCtxPushCurrent(cu_ctx);

        // Get texture
        res = cuGraphicsResourceSetMapFlags(
@@ -1431,6 +1406,8 @@ int main(int argc, char **argv) {
                    // int err = glGetError();
                    // fprintf(stderr, "error: %d\n", err);

+                    // TODO: Remove this copy, which is only possible by using nvenc directly and encoding window_pixmap.target_texture_id
+
                    CUDA_MEMCPY2D memcpy_struct;
                    memcpy_struct.srcXInBytes = 0;
                    memcpy_struct.srcY = 0;
@@ -1449,11 +1426,11 @@ int main(int argc, char **argv) {

                    frame_captured = true;
                } else {
-                    uint32_t byte_size;
-                    CUdeviceptr src_cu_device_ptr;
+                    // TODO: Check when src_cu_device_ptr changes and re-register resource
+                    uint32_t byte_size = 0;
+                    CUdeviceptr src_cu_device_ptr = 0;
                    frame_captured = nv_fbc_library.capture(&src_cu_device_ptr, &byte_size);
-                    if(frame_captured)
-                        cuMemcpyDtoD((CUdeviceptr)frame->data[0], src_cu_device_ptr, byte_size);
+                    frame->data[0] = (uint8_t*)src_cu_device_ptr;
                }
                // res = cuCtxPopCurrent(&old_ctx);
            }