From 82e37200849935beac7213877b526aeaf2170274 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Fri, 31 Mar 2023 01:11:12 +0200 Subject: [PATCH] Install coolbits if using nvidia, add preserve video memory install script --- 88-gsr-coolbits.conf | 2 +- README.md | 10 ++++- TODO | 5 +++ gsr-nvidia.conf | 1 + include/xnvctrl.h | 2 + install.sh | 1 + install_coolbits.sh | 4 +- install_preserve_video_memory.sh | 8 ++++ src/capture/nvfbc.c | 31 +++++++++----- src/cuda.c | 10 ++--- src/main.cpp | 3 +- src/overclock.c | 72 +++++++++++++++++++++++++------- 12 files changed, 113 insertions(+), 36 deletions(-) create mode 100644 gsr-nvidia.conf create mode 100755 install_preserve_video_memory.sh diff --git a/88-gsr-coolbits.conf b/88-gsr-coolbits.conf index be665f1..2a9055c 100644 --- a/88-gsr-coolbits.conf +++ b/88-gsr-coolbits.conf @@ -1,5 +1,5 @@ Section "Device" - Identifier "Device0" + Identifier "NvidiaCoolbits" Driver "nvidia" Option "Coolbits" "12" EndSection diff --git a/README.md b/README.md index 929a8ed..6f6568b 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,14 @@ and then rebooting your laptop. screen-direct capture has been temporary disabled as it causes issues with stuttering. This might be a nvfbc bug. # Performance +On a system with a i5 4690k CPU and a GTX 1080 GPU:\ When recording Legend of Zelda Breath of the Wild at 4k, fps drops from 30 to 7 when using OBS Studio + nvenc, however when using this screen recorder the fps remains at 30.\ When recording GTA V at 4k on highest settings, fps drops from 60 to 23 when using obs-nvfbc + nvenc, however when using this screen recorder the fps only drops to 58. The quality is also much better when using gpu-screen-recorder.\ -It is recommended to save the video to a SSD because of the large file size, which a slow HDD might not be fast enough to handle. 
+It is recommended to save the video to a SSD because of the large file size, which a slow HDD might not be fast enough to handle.\ +Note that if you have a very powerful CPU and a not so powerful GPU and play a game that is bottlenecked by your GPU and barely uses your CPU then a CPU based screen recording (such as OBS with libx264 instead of nvenc) might perform slightly better than GPU Screen Recorder. At least on NVIDIA. ## Note about optimal performance on NVIDIA NVIDIA driver has a "feature" (read: bug) where it will downclock memory transfer rate when a program uses cuda, such as GPU Screen Recorder. To work around this bug, GPU Screen Recorder can overclock your GPU memory transfer rate to it's normal optimal level.\ -To enable overclocking for optimal performance use the `-oc` option when running GPU Screen Recorder. You also need to have "Coolbits" NVIDIA X setting set to "12" to enable overclocking. You can automatically add this option if you run `install_coolbits.sh` and then reboot your computer.\ +To enable overclocking for optimal performance use the `-oc` option when running GPU Screen Recorder. You also need to have "Coolbits" NVIDIA X setting set to "12" to enable overclocking. You can automatically add this option if you run `install_coolbits.sh` and then reboot your computer. This script is automatically run if you are using NVIDIA and run `install.sh`.\ Note that this only works when Xorg server is running as root, and using this option will only give you a performance boost if the game you are recording is bottlenecked by your GPU.\ Note! use at your own risk! @@ -53,6 +55,10 @@ There is also a gui for the gpu-screen-recorder called [gpu-screen-recorder-gtk] Run the script `scripts/start-replay.sh` to start replay and then `scripts/save-replay.sh` to save a replay and `scripts/stop-replay.sh` to stop the replay. The videos are saved to `$HOME/Videos`. 
You can use these scripts to start replay at system startup if you add `scripts/start-replay.sh` to startup (this can be done differently depending on your desktop environment / window manager) and then go into hotkey settings on your system and choose a hotkey to run the script `scripts/save-replay.sh`. Modify `scripts/start-replay.sh` if you want to use other replay options. +## Issues +### NVIDIA +Nvidia drivers have an issue where CUDA breaks if CUDA is running when suspend/hibernation happens, and it remains broken until you reload the nvidia driver. To fix this, either disable suspend or tell the NVIDIA driver to preserve video memory on suspend/hibernate by using the `NVreg_PreserveVideoMemoryAllocations=1` option. You can run `install_preserve_video_memory.sh` to automatically add that option to your system. + # Demo [![Click here to watch a demo video on youtube](https://img.youtube.com/vi/n5tm0g01n6A/0.jpg)](https://www.youtube.com/watch?v=n5tm0g01n6A) diff --git a/TODO b/TODO index 12de328..7f330f3 100644 --- a/TODO +++ b/TODO @@ -28,3 +28,8 @@ Test different combinations of switchable graphics. Intel hybrid mode (running i https://web.archive.org/web/20210306020203/https://forums.developer.nvidia.com/t/performance-power-management-problem-on-shared-vgpu/161986 https://djdallmann.github.io/GamingPCSetup/CONTENT/RESEARCH/FINDINGS/registrykeys_displayadapter_class_4d36e968-e325-11ce-bfc1-08002be10318.txt + +The video output will be black if the system is suspended on nvidia and NVreg_PreserveVideoMemoryAllocations is not set to 1. This happens because I think that the driver invalidates textures/cuda buffers? To fix this we could try and recreate gsr capture when gsr_capture_capture fails (with timeout to retry again). + +NVreg_RegistryDwords. +Restore nvfbc screen recording on monitor reconfiguration. 
\ No newline at end of file diff --git a/gsr-nvidia.conf b/gsr-nvidia.conf new file mode 100644 index 0000000..10cbf7d --- /dev/null +++ b/gsr-nvidia.conf @@ -0,0 +1 @@ +options nvidia NVreg_PreserveVideoMemoryAllocations=1 diff --git a/include/xnvctrl.h b/include/xnvctrl.h index 8e026c4..33fc442 100644 --- a/include/xnvctrl.h +++ b/include/xnvctrl.h @@ -4,7 +4,9 @@ #include #include +#define NV_CTRL_GPU_NVCLOCK_OFFSET 409 #define NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET 410 +#define NV_CTRL_GPU_NVCLOCK_OFFSET_ALL_PERFORMANCE_LEVELS 424 #define NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET_ALL_PERFORMANCE_LEVELS 425 #define NV_CTRL_TARGET_TYPE_GPU 1 diff --git a/install.sh b/install.sh index 014223d..aa4c802 100755 --- a/install.sh +++ b/install.sh @@ -9,4 +9,5 @@ cd "$script_dir" install -Dm755 "gpu-screen-recorder" "/usr/local/bin/gpu-screen-recorder" install -Dm755 "gpu-screen-recorder" "/usr/bin/gpu-screen-recorder" +[ -f "/proc/driver/nvidia/version" ] && ./install_coolbits.sh echo "Successfully installed gpu-screen-recorder" diff --git a/install_coolbits.sh b/install_coolbits.sh index 0ce4c28..053e8ab 100755 --- a/install_coolbits.sh +++ b/install_coolbits.sh @@ -6,5 +6,5 @@ cd "$script_dir" [ $(id -u) -ne 0 ] && echo "You need root privileges to run the install script" && exit 1 for xorg_conf_d in "/etc/X11/xorg.conf.d" "/usr/share/X11/xorg.conf.d" "/usr/lib/X11/xorg.conf.d"; do - [ -d "$xorg_conf_d" ] && install -Dm644 "88-gsr-coolbits.conf" "$xorg_conf_d/88-gsr-coolbits.conf" -done \ No newline at end of file + [ -d "$xorg_conf_d" ] && install -Dm644 "88-gsr-coolbits.conf" "$xorg_conf_d/88-gsr-coolbits.conf" && exit 0 +done diff --git a/install_preserve_video_memory.sh b/install_preserve_video_memory.sh new file mode 100755 index 0000000..c5cf658 --- /dev/null +++ b/install_preserve_video_memory.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +script_dir=$(dirname "$0") +cd "$script_dir" + +[ $(id -u) -ne 0 ] && echo "You need root privileges to run the install script" && exit 1 + 
+install -Dm644 gsr-nvidia.conf /etc/modprobe.d/gsr-nvidia.conf diff --git a/src/capture/nvfbc.c b/src/capture/nvfbc.c index c1c998b..d538b5b 100644 --- a/src/capture/nvfbc.c +++ b/src/capture/nvfbc.c @@ -20,6 +20,7 @@ typedef struct { PNVFBCCREATEINSTANCE nv_fbc_create_instance; NVFBC_API_FUNCTION_LIST nv_fbc_function_list; bool fbc_handle_created; + bool capture_session_created; gsr_cuda cuda; bool frame_initialized; @@ -225,9 +226,9 @@ static int gsr_capture_nvfbc_start(gsr_capture *cap, AVCodecContext *video_codec NVFBCSTATUS status; NVFBC_TRACKING_TYPE tracking_type; - bool capture_session_created = false; uint32_t output_id = 0; cap_nvfbc->fbc_handle_created = false; + cap_nvfbc->capture_session_created = false; NVFBC_CREATE_HANDLE_PARAMS create_params; memset(&create_params, 0, sizeof(create_params)); @@ -295,6 +296,7 @@ static int gsr_capture_nvfbc_start(gsr_capture *cap, AVCodecContext *video_codec create_capture_params.dwSamplingRateMs = 1000u / ((uint32_t)cap_nvfbc->params.fps + 1); create_capture_params.bAllowDirectCapture = direct_capture ? NVFBC_TRUE : NVFBC_FALSE; create_capture_params.bPushModel = direct_capture ? 
NVFBC_TRUE : NVFBC_FALSE; + //create_capture_params.bDisableAutoModesetRecovery = true; // TODO: if(tracking_type == NVFBC_TRACKING_OUTPUT) create_capture_params.dwOutputId = output_id; @@ -303,7 +305,7 @@ static int gsr_capture_nvfbc_start(gsr_capture *cap, AVCodecContext *video_codec fprintf(stderr, "gsr error: gsr_capture_nvfbc_start failed: %s\n", cap_nvfbc->nv_fbc_function_list.nvFBCGetLastErrorStr(cap_nvfbc->nv_fbc_handle)); goto error_cleanup; } - capture_session_created = true; + cap_nvfbc->capture_session_created = true; NVFBC_TOCUDA_SETUP_PARAMS setup_params; memset(&setup_params, 0, sizeof(setup_params)); @@ -331,11 +333,12 @@ static int gsr_capture_nvfbc_start(gsr_capture *cap, AVCodecContext *video_codec error_cleanup: if(cap_nvfbc->fbc_handle_created) { - if(capture_session_created) { + if(cap_nvfbc->capture_session_created) { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS destroy_capture_params; memset(&destroy_capture_params, 0, sizeof(destroy_capture_params)); destroy_capture_params.dwVersion = NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER; cap_nvfbc->nv_fbc_function_list.nvFBCDestroyCaptureSession(cap_nvfbc->nv_fbc_handle, &destroy_capture_params); + cap_nvfbc->capture_session_created = false; } NVFBC_DESTROY_HANDLE_PARAMS destroy_params; @@ -357,15 +360,21 @@ static int gsr_capture_nvfbc_start(gsr_capture *cap, AVCodecContext *video_codec static void gsr_capture_nvfbc_destroy_session(gsr_capture *cap) { gsr_capture_nvfbc *cap_nvfbc = cap->priv; - NVFBC_DESTROY_CAPTURE_SESSION_PARAMS destroy_capture_params; - memset(&destroy_capture_params, 0, sizeof(destroy_capture_params)); - destroy_capture_params.dwVersion = NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER; - cap_nvfbc->nv_fbc_function_list.nvFBCDestroyCaptureSession(cap_nvfbc->nv_fbc_handle, &destroy_capture_params); + if(cap_nvfbc->fbc_handle_created) { + if(cap_nvfbc->capture_session_created) { + NVFBC_DESTROY_CAPTURE_SESSION_PARAMS destroy_capture_params; + memset(&destroy_capture_params, 0, 
sizeof(destroy_capture_params)); + destroy_capture_params.dwVersion = NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER; + cap_nvfbc->nv_fbc_function_list.nvFBCDestroyCaptureSession(cap_nvfbc->nv_fbc_handle, &destroy_capture_params); + cap_nvfbc->capture_session_created = false; + } - NVFBC_DESTROY_HANDLE_PARAMS destroy_params; - memset(&destroy_params, 0, sizeof(destroy_params)); - destroy_params.dwVersion = NVFBC_DESTROY_HANDLE_PARAMS_VER; - cap_nvfbc->nv_fbc_function_list.nvFBCDestroyHandle(cap_nvfbc->nv_fbc_handle, &destroy_params); + NVFBC_DESTROY_HANDLE_PARAMS destroy_params; + memset(&destroy_params, 0, sizeof(destroy_params)); + destroy_params.dwVersion = NVFBC_DESTROY_HANDLE_PARAMS_VER; + cap_nvfbc->nv_fbc_function_list.nvFBCDestroyHandle(cap_nvfbc->nv_fbc_handle, &destroy_params); + cap_nvfbc->fbc_handle_created = false; + } cap_nvfbc->nv_fbc_handle = 0; } diff --git a/src/cuda.c b/src/cuda.c index 470747b..ea14e8b 100644 --- a/src/cuda.c +++ b/src/cuda.c @@ -94,6 +94,11 @@ bool gsr_cuda_load(gsr_cuda *self, Display *display, bool do_overclock) { } void gsr_cuda_unload(gsr_cuda *self) { + if(self->do_overclock && self->overclock.xnvctrl.library) { + gsr_overclock_stop(&self->overclock); + gsr_overclock_unload(&self->overclock); + } + if(self->library) { if(self->cu_ctx) { self->cuCtxDestroy_v2(self->cu_ctx); @@ -102,10 +107,5 @@ void gsr_cuda_unload(gsr_cuda *self) { dlclose(self->library); } - if(self->do_overclock && self->overclock.xnvctrl.library) { - gsr_overclock_stop(&self->overclock); - gsr_overclock_unload(&self->overclock); - } - memset(self, 0, sizeof(gsr_cuda)); } diff --git a/src/main.cpp b/src/main.cpp index 36fcfe5..4cd8682 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -659,7 +659,7 @@ static void open_video(AVCodecContext *codec_context, VideoQuality video_quality } static void usage() { - fprintf(stderr, "usage: gpu-screen-recorder -w [-c ] [-s WxH] -f [-a ...] 
[-q ] [-r ] [-k h264|h265] [-ac aac|opus|flac] [-oc yes|no] [-o ]\n"); + fprintf(stderr, "usage: gpu-screen-recorder -w [-c ] [-s WxH] -f [-a ] [-q ] [-r ] [-k h264|h265] [-ac aac|opus|flac] [-oc yes|no] [-o ]\n"); fprintf(stderr, "\n"); fprintf(stderr, "OPTIONS:\n"); fprintf(stderr, " -w Window to record, a display, \"screen\", \"screen-direct\", \"screen-direct-force\" or \"focused\".\n"); @@ -2005,6 +2005,7 @@ int main(int argc, char **argv) { frame_time_overflow = std::min(frame_time_overflow, target_fps); frame_timer_start = time_now - frame_time_overflow; gsr_capture_capture(capture, frame); + std::lock_guard lock(video_frame_mutex); if(latest_video_frame) { av_frame_free(&latest_video_frame); diff --git a/src/overclock.c b/src/overclock.c index 8517876..7c0faad 100644 --- a/src/overclock.c +++ b/src/overclock.c @@ -8,10 +8,12 @@ // So to get around this we overclock memory transfer rate (maybe this should also be done for graphics clock?) to the best performance level while GPU Screen Recorder is running. // TODO: Does it always drop to performance level 2? -// TODO: Also do the same for graphics clock and graphics memory? + +static int min_int(int a, int b) { + return a < b ? 
a : b; +} // Fields are 0 if not set - typedef struct { int perf; @@ -48,31 +50,56 @@ static void split_by_delimiter(const char *str, size_t size, char delimiter, spl } } +typedef enum { + NVCTRL_GPU_NVCLOCK, + NVCTRL_ATTRIB_GPU_MEM_TRANSFER_RATE, +} NvCTRLAttributeType; + +static unsigned int attribute_type_to_attribute_param(NvCTRLAttributeType attribute_type) { + switch(attribute_type) { + case NVCTRL_GPU_NVCLOCK: + return NV_CTRL_GPU_NVCLOCK_OFFSET; + case NVCTRL_ATTRIB_GPU_MEM_TRANSFER_RATE: + return NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET; + } + return 0; +} + +static unsigned int attribute_type_to_attribute_param_all_levels(NvCTRLAttributeType attribute_type) { + switch(attribute_type) { + case NVCTRL_GPU_NVCLOCK: + return NV_CTRL_GPU_NVCLOCK_OFFSET_ALL_PERFORMANCE_LEVELS; + case NVCTRL_ATTRIB_GPU_MEM_TRANSFER_RATE: + return NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET_ALL_PERFORMANCE_LEVELS; + } + return 0; +} + // Returns 0 on error -static int xnvctrl_get_memory_transfer_rate_max(gsr_xnvctrl *xnvctrl, const NVCTRLPerformanceLevelQuery *query) { +static int xnvctrl_get_attribute_max_value(gsr_xnvctrl *xnvctrl, const NVCTRLPerformanceLevelQuery *query, NvCTRLAttributeType attribute_type) { NVCTRLAttributeValidValuesRec valid; - if(xnvctrl->XNVCTRLQueryValidTargetAttributeValues(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, 0, NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET_ALL_PERFORMANCE_LEVELS, &valid)) { + if(xnvctrl->XNVCTRLQueryValidTargetAttributeValues(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, 0, attribute_type_to_attribute_param_all_levels(attribute_type), &valid)) { return valid.u.range.max; } - if(query->num_performance_levels > 0 && xnvctrl->XNVCTRLQueryValidTargetAttributeValues(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, query->num_performance_levels - 1, NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET, &valid)) { + if(query->num_performance_levels > 0 && xnvctrl->XNVCTRLQueryValidTargetAttributeValues(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, 
query->num_performance_levels - 1, attribute_type_to_attribute_param(attribute_type), &valid)) { return valid.u.range.max; } return 0; } -static bool xnvctrl_set_memory_transfer_rate_offset(gsr_xnvctrl *xnvctrl, int num_performance_levels, int offset) { +static bool xnvctrl_set_attribute_offset(gsr_xnvctrl *xnvctrl, int num_performance_levels, int offset, NvCTRLAttributeType attribute_type) { bool success = false; // NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET_ALL_PERFORMANCE_LEVELS works (or at least used to?) without Xorg running as root // so we try that first. NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET_ALL_PERFORMANCE_LEVELS also only works with GTX 1000+. // TODO: Reverse engineer NVIDIA Xorg driver so we can set this always without root access. - if(xnvctrl->XNVCTRLSetTargetAttributeAndGetStatus(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, 0, NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET_ALL_PERFORMANCE_LEVELS, offset)) + if(xnvctrl->XNVCTRLSetTargetAttributeAndGetStatus(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, 0, attribute_type_to_attribute_param_all_levels(attribute_type), offset)) success = true; for(int i = 0; i < num_performance_levels; ++i) { - success |= xnvctrl->XNVCTRLSetTargetAttributeAndGetStatus(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, i, NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET, offset); + success |= xnvctrl->XNVCTRLSetTargetAttributeAndGetStatus(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, i, attribute_type_to_attribute_param(attribute_type), offset); } return success; @@ -207,21 +234,38 @@ bool gsr_overclock_start(gsr_overclock *self) { } self->num_performance_levels = query.num_performance_levels; - int target_transfer_rate_offset = xnvctrl_get_memory_transfer_rate_max(&self->xnvctrl, &query) / 2; - if(query.num_performance_levels > 3) { + int target_transfer_rate_offset = xnvctrl_get_attribute_max_value(&self->xnvctrl, &query, NVCTRL_ATTRIB_GPU_MEM_TRANSFER_RATE) / 2; // Divide by 2 just to be safe that we dont set it too high + 
if(query.num_performance_levels > 2) { const int transfer_rate_max_diff = query.performance_level[query.num_performance_levels - 1].mem_transfer_rate_max - query.performance_level[2].mem_transfer_rate_max; - if(transfer_rate_max_diff > 0 && transfer_rate_max_diff < target_transfer_rate_offset) - target_transfer_rate_offset = transfer_rate_max_diff; + target_transfer_rate_offset = min_int(target_transfer_rate_offset, transfer_rate_max_diff); } - if(xnvctrl_set_memory_transfer_rate_offset(&self->xnvctrl, self->num_performance_levels, target_transfer_rate_offset)) { + if(xnvctrl_set_attribute_offset(&self->xnvctrl, self->num_performance_levels, target_transfer_rate_offset, NVCTRL_ATTRIB_GPU_MEM_TRANSFER_RATE)) { fprintf(stderr, "gsr info: gsr_overclock_start: successfully set memory transfer rate offset to %d\n", target_transfer_rate_offset); } else { fprintf(stderr, "gsr info: gsr_overclock_start: failed to overclock memory transfer rate offset to %d\n", target_transfer_rate_offset); } + + + // TODO: Enable. Crashes on my system (gtx 1080) so it's disabled for now. 
Seems to crash even if graphics clock is increased by 1, let alone 1200 + /* + int target_nv_clock_offset = xnvctrl_get_attribute_max_value(&self->xnvctrl, &query, NVCTRL_GPU_NVCLOCK) / 2; // Divide by 2 just to be safe that we dont set it too high + if(query.num_performance_levels > 2) { + const int nv_clock_max_diff = query.performance_level[query.num_performance_levels - 1].nv_clock_max - query.performance_level[2].nv_clock_max; + target_nv_clock_offset = min_int(target_nv_clock_offset, nv_clock_max_diff); + } + + if(xnvctrl_set_attribute_offset(&self->xnvctrl, self->num_performance_levels, target_nv_clock_offset, NVCTRL_GPU_NVCLOCK)) { + fprintf(stderr, "gsr info: gsr_overclock_start: successfully set nv clock offset to %d\n", target_nv_clock_offset); + } else { + fprintf(stderr, "gsr info: gsr_overclock_start: failed to overclock nv clock offset to %d\n", target_nv_clock_offset); + } + */ + return true; } void gsr_overclock_stop(gsr_overclock *self) { - xnvctrl_set_memory_transfer_rate_offset(&self->xnvctrl, self->num_performance_levels, 0); + xnvctrl_set_attribute_offset(&self->xnvctrl, self->num_performance_levels, 0, NVCTRL_ATTRIB_GPU_MEM_TRANSFER_RATE); + //xnvctrl_set_attribute_offset(&self->xnvctrl, self->num_performance_levels, 0, NVCTRL_GPU_NVCLOCK); }