9 files changed, 336 insertions, 53 deletions
diff --git a/src/capture/nvfbc.c b/src/capture/nvfbc.c
index fba3321..5371d02 100644
--- a/src/capture/nvfbc.c
+++ b/src/capture/nvfbc.c
@@ -183,7 +183,7 @@ static bool ffmpeg_create_cuda_contexts(gsr_capture_nvfbc *cap_nvfbc, AVCodecCon
 
 static int gsr_capture_nvfbc_start(gsr_capture *cap, AVCodecContext *video_codec_context) {
     gsr_capture_nvfbc *cap_nvfbc = cap->priv;
-    if(!gsr_cuda_load(&cap_nvfbc->cuda))
+    if(!gsr_cuda_load(&cap_nvfbc->cuda, cap_nvfbc->params.dpy, cap_nvfbc->params.overclock))
         return -1;
 
     if(!gsr_capture_nvfbc_load_library(cap)) {
diff --git a/src/capture/xcomposite_cuda.c b/src/capture/xcomposite_cuda.c
index 877206b..fd140c6 100644
--- a/src/capture/xcomposite_cuda.c
+++ b/src/capture/xcomposite_cuda.c
@@ -222,7 +222,7 @@ static int gsr_capture_xcomposite_cuda_start(gsr_capture *cap, AVCodecContext *v
         return -1;
     }
 
-    if(!gsr_cuda_load(&cap_xcomp->cuda)) {
+    if(!gsr_cuda_load(&cap_xcomp->cuda, cap_xcomp->dpy, cap_xcomp->params.overclock)) {
         gsr_capture_xcomposite_cuda_stop(cap, video_codec_context);
         return -1;
     }
@@ -269,7 +269,8 @@ static void gsr_capture_xcomposite_cuda_stop(gsr_capture *cap, AVCodecContext *v
 
     gsr_egl_unload(&cap_xcomp->egl);
     if(cap_xcomp->dpy) {
-        XCloseDisplay(cap_xcomp->dpy);
+        // TODO: This causes a crash, why? maybe some other library dlclose xlib and that also happened to unload this???
+        //XCloseDisplay(cap_xcomp->dpy);
         cap_xcomp->dpy = NULL;
     }
 }
@@ -424,6 +425,7 @@ static int gsr_capture_xcomposite_cuda_capture(gsr_capture *cap, AVFrame *frame)
     vec2i source_size = cap_xcomp->texture_size;
 
     if(cap_xcomp->window_texture.texture_id != 0) {
+        while(cap_xcomp->egl.glGetError()) {}
         /* TODO: Remove this copy, which is only possible by using nvenc directly and encoding window_pixmap.target_texture_id */
         cap_xcomp->egl.glCopyImageSubData(
             window_texture_get_opengl_texture_id(&cap_xcomp->window_texture), GL_TEXTURE_2D, 0, source_pos.x, source_pos.y, 0,
diff --git a/src/capture/xcomposite_drm.c b/src/capture/xcomposite_drm.c
index 9fb323d..489fc45 100644
--- a/src/capture/xcomposite_drm.c
+++ b/src/capture/xcomposite_drm.c
@@ -706,6 +706,7 @@ static void gsr_capture_xcomposite_drm_tick(gsr_capture *cap, AVCodecContext *vi
         #define FOURCC_NV12 842094158
 
         if(prime.fourcc == FOURCC_NV12) { // This happens on AMD
+            while(cap_xcomp->egl.glGetError()) {}
             while(cap_xcomp->egl.eglGetError() != EGL_SUCCESS){}
 
             EGLImage images[2];
@@ -902,7 +903,8 @@ static void gsr_capture_xcomposite_drm_destroy(gsr_capture *cap, AVCodecContext
         cap->priv = NULL;
     }
     if(cap_xcomp->dpy) {
-        XCloseDisplay(cap_xcomp->dpy);
+        // TODO: This causes a crash, why? maybe some other library dlclose xlib and that also happened to unload this???
+        //XCloseDisplay(cap_xcomp->dpy);
         cap_xcomp->dpy = NULL;
     }
     free(cap);
diff --git a/src/cuda.c b/src/cuda.c
index 3076ebe..470747b 100644
--- a/src/cuda.c
+++ b/src/cuda.c
@@ -2,8 +2,9 @@
 #include "../include/library_loader.h"
 #include <string.h>
 
-bool gsr_cuda_load(gsr_cuda *self) {
+bool gsr_cuda_load(gsr_cuda *self, Display *display, bool do_overclock) {
     memset(self, 0, sizeof(gsr_cuda));
+    self->do_overclock = do_overclock;
 
     dlerror(); /* clear */
     void *lib = dlopen("libcuda.so.1", RTLD_LAZY);
@@ -76,6 +77,13 @@ bool gsr_cuda_load(gsr_cuda *self) {
         goto fail;
     }
 
+    if(self->do_overclock) {
+        if(gsr_overclock_load(&self->overclock, display))
+            gsr_overclock_start(&self->overclock);
+        else
+            fprintf(stderr, "gsr warning: gsr_cuda_load: failed to load xnvctrl, failed to overclock memory transfer rate\n");
+    }
+
     self->library = lib;
     return true;
 
@@ -91,8 +99,13 @@ void gsr_cuda_unload(gsr_cuda *self) {
             self->cuCtxDestroy_v2(self->cu_ctx);
             self->cu_ctx = 0;
         }
-
         dlclose(self->library);
-        memset(self, 0, sizeof(gsr_cuda));
     }
+
+    if(self->do_overclock && self->overclock.xnvctrl.library) {
+        gsr_overclock_stop(&self->overclock);
+        gsr_overclock_unload(&self->overclock);
+    }
+
+    memset(self, 0, sizeof(gsr_cuda));
 }
diff --git a/src/main.cpp b/src/main.cpp
index 1c6dad9..762b3cd 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -630,7 +630,7 @@ static void open_video(AVCodecContext *codec_context, VideoQuality video_quality
 }
 
 static void usage() {
-    fprintf(stderr, "usage: gpu-screen-recorder -w <window_id|monitor|focused> [-c <container_format>] [-s WxH] -f <fps> [-a <audio_input>...] [-q <quality>] [-r <replay_buffer_size_sec>] [-k h264|h265] [-ac aac|opus|flac] [-o <output_file>]\n");
+    fprintf(stderr, "usage: gpu-screen-recorder -w <window_id|monitor|focused> [-c <container_format>] [-s WxH] -f <fps> [-a <audio_input>...] [-q <quality>] [-r <replay_buffer_size_sec>] [-k h264|h265] [-ac aac|opus|flac] [-oc yes|no] [-o <output_file>]\n");
     fprintf(stderr, "OPTIONS:\n");
     fprintf(stderr, "  -w    Window to record, a display, \"screen\", \"screen-direct\", \"screen-direct-force\" or \"focused\". The display is the display (monitor) name in xrandr and if \"screen\" or \"screen-direct\" is selected then all displays are recorded. If this is \"focused\" then the currently focused window is recorded. When recording the focused window then the -s option has to be used as well.\n"
         "        \"screen-direct\"/\"screen-direct-force\" skips one texture copy for fullscreen applications so it may lead to better performance and it works with VRR monitors when recording fullscreen application but may break some applications, such as mpv in fullscreen mode. Direct mode doesn't capture cursor either. \"screen-direct-force\" is not recommended unless you use a VRR monitor because there might be driver issues that cause the video to stutter or record a black screen.\n");
@@ -644,6 +644,7 @@ static void usage() {
         " This option has be between 5 and 1200. Note that the replay buffer size will not always be precise, because of keyframes. Optional, disabled by default.\n");
     fprintf(stderr, "  -k    Video codec to use. Should be either 'auto', 'h264' or 'h265'. Defaults to 'auto' which defaults to 'h265' on nvidia unless recording at a higher resolution than 3840x2160. On AMD/Intel this defaults to 'auto' which defaults to 'h264'. Forcefully set to 'h264' if -c is 'flv'.\n");
     fprintf(stderr, "  -ac   Audio codec to use. Should be either 'aac', 'opus' or 'flac'. Defaults to 'opus' for .mp4/.mkv files, otherwise defaults to 'aac'. 'opus' and 'flac' is only supported by .mp4/.mkv files. 'opus' is recommended for best performance and smallest audio size.\n");
+    fprintf(stderr, "  -oc   Overclock memory transfer rate to the maximum performance level. This only applies to NVIDIA and exists to overcome a bug in NVIDIA driver where performance level is dropped when you record a game. Only needed if you are recording a game that is bottlenecked by GPU. Works only if your have \"Coolbits\" set to \"12\" in NVIDIA X settings, see README for more information. Obs! use at your own risk! Optional, disabled by default\n");
     fprintf(stderr, "  -o    The output file path. If omitted then the encoded data is sent to stdout. Required in replay mode (when using -r). In replay mode this has to be an existing directory instead of a file.\n");
     fprintf(stderr, "NOTES:\n");
     fprintf(stderr, "  Send signal SIGINT (Ctrl+C) to gpu-screen-recorder to stop and save the recording (when not using replay mode).\n");
@@ -1064,10 +1065,11 @@ int main(int argc, char **argv) {
         { "-o", Arg { {}, true, false } },
         { "-r", Arg { {}, true, false } },
         { "-k", Arg { {}, true, false } },
-        { "-ac", Arg { {}, true, false } }
+        { "-ac", Arg { {}, true, false } },
+        { "-oc", Arg { {}, true, false } }
     };
 
-    for(int i = 1; i < argc - 1; i += 2) {
+    for(int i = 1; i < argc; i += 2) {
         auto it = args.find(argv[i]);
         if(it == args.end()) {
             fprintf(stderr, "Invalid argument '%s'\n", argv[i]);
@@ -1079,6 +1081,11 @@ int main(int argc, char **argv) {
             usage();
         }
 
+        if(i + 1 >= argc) {
+            fprintf(stderr, "Missing value for argument '%s'\n", argv[i]);
+            usage();
+        }
+
         it->second.values.push_back(argv[i + 1]);
     }
 
@@ -1119,6 +1126,11 @@ int main(int argc, char **argv) {
         usage();
     }
 
+    const char *overclock_str = args["-oc"].value();
+    if(!overclock_str)
+        overclock_str = "no";
+    const bool overclock = strcmp(overclock_str, "yes") == 0;
+
     const Arg &audio_input_arg = args["-a"];
     const std::vector<AudioInput> audio_inputs = get_pulseaudio_inputs();
     std::vector<MergedAudioInputs> requested_audio_inputs;
@@ -1221,6 +1233,10 @@ int main(int argc, char **argv) {
         usage();
     }
 
+    vec2i region_size = { 0, 0 };
+    Window src_window_id = None;
+    bool follow_focused = false;
+
     gsr_capture *capture = nullptr;
     if(strcmp(window_str, "focused") == 0) {
         if(!screen_region) {
@@ -1228,7 +1244,6 @@ int main(int argc, char **argv) {
             usage();
         }
 
-        vec2i region_size = { 0, 0 };
         if(sscanf(screen_region, "%dx%d", &region_size.x, &region_size.y) != 2) {
             fprintf(stderr, "Error: invalid value for option -s '%s', expected a value in format WxH\n", screen_region);
             usage();
@@ -1239,38 +1254,7 @@ int main(int argc, char **argv) {
             usage();
         }
 
-        switch(gpu_inf.vendor) {
-            case GPU_VENDOR_AMD: {
-                gsr_capture_xcomposite_drm_params xcomposite_params;
-                xcomposite_params.window = 0;
-                xcomposite_params.follow_focused = true;
-                xcomposite_params.region_size = region_size;
-                capture = gsr_capture_xcomposite_drm_create(&xcomposite_params);
-                if(!capture)
-                    return 1;
-                break;
-            }
-            case GPU_VENDOR_INTEL: {
-                gsr_capture_xcomposite_drm_params xcomposite_params;
-                xcomposite_params.window = 0;
-                xcomposite_params.follow_focused = true;
-                xcomposite_params.region_size = region_size;
-                capture = gsr_capture_xcomposite_drm_create(&xcomposite_params);
-                if(!capture)
-                    return 1;
-                break;
-            }
-            case GPU_VENDOR_NVIDIA: {
-                gsr_capture_xcomposite_cuda_params xcomposite_params;
-                xcomposite_params.window = 0;
-                xcomposite_params.follow_focused = true;
-                xcomposite_params.region_size = region_size;
-                capture = gsr_capture_xcomposite_cuda_create(&xcomposite_params);
-                if(!capture)
-                    return 1;
-                break;
-            }
-        }
+        follow_focused = true;
     } else if(contains_non_hex_number(window_str)) {
         if(gpu_inf.vendor != GPU_VENDOR_NVIDIA) {
             fprintf(stderr, "Error: recording a monitor is only supported on NVIDIA right now. Record \"focused\" instead for convenient fullscreen window recording\n");
@@ -1310,23 +1294,26 @@ int main(int argc, char **argv) {
         nvfbc_params.pos = { 0, 0 };
         nvfbc_params.size = { 0, 0 };
         nvfbc_params.direct_capture = direct_capture;
+        nvfbc_params.overclock = overclock;
         capture = gsr_capture_nvfbc_create(&nvfbc_params);
         if(!capture)
             return 1;
     } else {
         errno = 0;
-        Window src_window_id = strtol(window_str, nullptr, 0);
+        src_window_id = strtol(window_str, nullptr, 0);
         if(src_window_id == None || errno == EINVAL) {
             fprintf(stderr, "Invalid window number %s\n", window_str);
             usage();
         }
+    }
 
+    if(!capture) {
         switch(gpu_inf.vendor) {
             case GPU_VENDOR_AMD: {
                 gsr_capture_xcomposite_drm_params xcomposite_params;
                 xcomposite_params.window = src_window_id;
-                xcomposite_params.follow_focused = false;
-                xcomposite_params.region_size = { 0, 0 };
+                xcomposite_params.follow_focused = follow_focused;
+                xcomposite_params.region_size = region_size;
                 capture = gsr_capture_xcomposite_drm_create(&xcomposite_params);
                 if(!capture)
                     return 1;
@@ -1335,8 +1322,8 @@ int main(int argc, char **argv) {
             case GPU_VENDOR_INTEL: {
                 gsr_capture_xcomposite_drm_params xcomposite_params;
                 xcomposite_params.window = src_window_id;
-                xcomposite_params.follow_focused = false;
-                xcomposite_params.region_size = { 0, 0 };
+                xcomposite_params.follow_focused = follow_focused;
+                xcomposite_params.region_size = region_size;
                 capture = gsr_capture_xcomposite_drm_create(&xcomposite_params);
                 if(!capture)
                     return 1;
@@ -1345,8 +1332,9 @@ int main(int argc, char **argv) {
             case GPU_VENDOR_NVIDIA: {
                 gsr_capture_xcomposite_cuda_params xcomposite_params;
                 xcomposite_params.window = src_window_id;
-                xcomposite_params.follow_focused = false;
-                xcomposite_params.region_size = { 0, 0 };
+                xcomposite_params.follow_focused = follow_focused;
+                xcomposite_params.region_size = region_size;
+                xcomposite_params.overclock = overclock;
                 capture = gsr_capture_xcomposite_cuda_create(&xcomposite_params);
                 if(!capture)
                     return 1;
@@ -1874,8 +1862,10 @@ int main(int argc, char **argv) {
 
     gsr_capture_destroy(capture, video_codec_context);
 
-    if(dpy)
-        XCloseDisplay(dpy);
+    if(dpy) {
+        // TODO: This causes a crash, why? maybe some other library dlclose xlib and that also happened to unload this???
+        //XCloseDisplay(dpy);
+    }
 
     free(empty_audio);
     return should_stop_error ? 3 : 0;
diff --git a/src/overclock.c b/src/overclock.c
new file mode 100644
index 0000000..5957f92
--- /dev/null
+++ b/src/overclock.c
@@ -0,0 +1,231 @@
+#include "../include/overclock.h"
+#include <X11/Xlib.h>
+#include <stdio.h>
+#include <string.h>
+
+// HACK!!!: When a program uses cuda (including nvenc) then the nvidia driver drops to performance level 2 (memory transfer rate is dropped and possibly graphics clock).
+// Nvidia does this because in some very extreme cases of cuda there can be memory corruption when running at max memory transfer rate.
+// So to get around this we overclock memory transfer rate (maybe this should also be done for graphics clock?) to the best performance level while GPU Screen Recorder is running.
+
+// TODO: Does it always drop to performance level 2?
+// TODO: Also do the same for graphics clock and graphics memory?
+
+// Fields are 0 if not set
+
+static int min_int(int a, int b) { // Returns the smaller of a and b; the explicit int return type is required since C99 (no implicit int)
+    return a < b ? a : b;
+}
+
+typedef struct { // One performance level as filled in by attribute_callback; every field is 0 if its attribute was not present
+    int perf; // value of the "perf" attribute
+
+    int nv_clock; // value of the "nvclock" attribute
+    int nv_clock_min; // value of the "nvclockmin" attribute
+    int nv_clock_max; // value of the "nvclockmax" attribute
+
+    int mem_clock; // value of the "memclock" attribute
+    int mem_clock_min; // value of the "memclockmin" attribute
+    int mem_clock_max; // value of the "memclockmax" attribute
+
+    int mem_transfer_rate; // value of the "memTransferRate" attribute
+    int mem_transfer_rate_min; // value of the "memTransferRatemin" attribute
+    int mem_transfer_rate_max; // value of the "memTransferRatemax" attribute
+} NVCTRLPerformanceLevel;
+
+#define MAX_PERFORMANCE_LEVELS 12 // capacity of performance_level below; attribute_line_callback ignores levels beyond it
+typedef struct {
+    NVCTRLPerformanceLevel performance_level[MAX_PERFORMANCE_LEVELS]; // only the first num_performance_levels entries are filled in
+    int num_performance_levels; // number of entries of performance_level that have been filled in
+} NVCTRLPerformanceLevelQuery;
+
+typedef void (*split_callback)(const char *str, size_t size, void *userdata);
+// Calls |callback| for each |delimiter|-separated piece of the |size| bytes at |str|; the pieces are not null-terminated
+static void split_by_delimiter(const char *str, size_t size, char delimiter, split_callback callback, void *userdata) {
+    const char *const end = str + size;
+    for(const char *piece = str; piece < end;) {
+        // A piece runs up to the next delimiter, or to the end of the input when there is none left
+        const char *piece_end = memchr(piece, delimiter, end - piece);
+        if(!piece_end)
+            piece_end = end;
+        callback(piece, piece_end - piece, userdata);
+        piece = piece_end + 1; // continue after the delimiter
+    }
+}
+
+// Returns the upper bound of the valid memory transfer rate offset range, or 0 on error
+static int xnvctrl_get_memory_transfer_rate_max(gsr_xnvctrl *xnvctrl, const NVCTRLPerformanceLevelQuery *query) {
+    NVCTRLAttributeValidValuesRec range_info;
+    // Preferred: the range of the attribute that covers all performance levels at once
+    if(xnvctrl->XNVCTRLQueryValidTargetAttributeValues(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, 0, NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET_ALL_PERFORMANCE_LEVELS, &range_info))
+        return range_info.u.range.max;
+
+    // Fallback: the per-level attribute, queried with the index of the last parsed performance level
+    if(query->num_performance_levels > 0 && xnvctrl->XNVCTRLQueryValidTargetAttributeValues(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, query->num_performance_levels - 1, NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET, &range_info))
+        return range_info.u.range.max;
+
+    return 0;
+}
+
+// Requests |offset| as the memory transfer rate offset; returns true if at least one of the requests succeeded
+static bool xnvctrl_set_memory_transfer_rate_offset(gsr_xnvctrl *xnvctrl, int num_performance_levels, int offset) {
+    // NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET_ALL_PERFORMANCE_LEVELS works (or at least used to?) without Xorg running as root
+    // so we try that first. NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET_ALL_PERFORMANCE_LEVELS also only works with GTX 1000+.
+    // TODO: Reverse engineer NVIDIA Xorg driver so we can set this always without root access.
+    bool any_succeeded = xnvctrl->XNVCTRLSetTargetAttributeAndGetStatus(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, 0, NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET_ALL_PERFORMANCE_LEVELS, offset) ? true : false;
+
+    // Every performance level is attempted as well, even if an earlier request already succeeded
+    for(int level = 0; level < num_performance_levels; ++level) {
+        if(xnvctrl->XNVCTRLSetTargetAttributeAndGetStatus(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, level, NV_CTRL_GPU_MEM_TRANSFER_RATE_OFFSET, offset))
+            any_succeeded = true;
+    }
+
+    return any_succeeded;
+}
+
+// Trims leading and trailing spaces, tabs and newlines from the |*size| bytes at |*str|.
+// On return |*str| points at the first kept byte and |*size| is the number of kept bytes.
+static void strip(const char **str, int *size) {
+    const char *begin = *str;
+    const char *end = begin + *size;
+
+    while(begin < end) {
+        const char c = *begin;
+        if(c != ' ' && c != '\t' && c != '\n')
+            break;
+        ++begin;
+    }
+
+    // Inspect the last byte inside the range (end[-1]); end itself is one past the data
+    while(end > begin) {
+        const char c = end[-1];
+        if(c != ' ' && c != '\t' && c != '\n')
+            break;
+        --end;
+    }
+
+    // The size is measured from the new start, so callers never read past the original range
+    *str = begin;
+    *size = (int)(end - begin);
+}
+
+static void attribute_callback(const char *str, size_t size, void *userdata) { // Parses one "name=value" attribute into the NVCTRLPerformanceLevel passed as userdata
+    if(size > 255 - 1) // must fit in the attribute buffer below together with the null terminator
+        return;
+
+    int size_i = size;
+    strip(&str, &size_i);
+
+    char attribute[255]; // null-terminated copy so that strchr and sscanf can be used on it
+    memcpy(attribute, str, size_i);
+    attribute[size_i] = '\0';
+
+    const char *sep = strchr(attribute, '=');
+    if(!sep) // not a name=value pair, ignore it
+        return;
+
+    const char *attribute_name = attribute;
+    size_t attribute_name_len = sep - attribute_name; // the name is everything before the first '='
+    const char *attribute_value_str = sep + 1;
+
+    int attribute_value = 0;
+    if(sscanf(attribute_value_str, "%d", &attribute_value) != 1) // attributes without an integer value are ignored
+        return;
+
+    NVCTRLPerformanceLevel *performance_level = userdata; // names are matched exactly (length and bytes); unknown names are ignored
+    if(attribute_name_len == 4 && memcmp(attribute_name, "perf", 4) == 0)
+        performance_level->perf = attribute_value;
+    else if(attribute_name_len == 7 && memcmp(attribute_name, "nvclock", 7) == 0)
+        performance_level->nv_clock = attribute_value;
+    else if(attribute_name_len == 10 && memcmp(attribute_name, "nvclockmin", 10) == 0)
+        performance_level->nv_clock_min = attribute_value;
+    else if(attribute_name_len == 10 && memcmp(attribute_name, "nvclockmax", 10) == 0)
+        performance_level->nv_clock_max = attribute_value;
+    else if(attribute_name_len == 8 && memcmp(attribute_name, "memclock", 8) == 0)
+        performance_level->mem_clock = attribute_value;
+    else if(attribute_name_len == 11 && memcmp(attribute_name, "memclockmin", 11) == 0)
+        performance_level->mem_clock_min = attribute_value;
+    else if(attribute_name_len == 11 && memcmp(attribute_name, "memclockmax", 11) == 0)
+        performance_level->mem_clock_max = attribute_value;
+    else if(attribute_name_len == 15 && memcmp(attribute_name, "memTransferRate", 15) == 0)
+        performance_level->mem_transfer_rate = attribute_value;
+    else if(attribute_name_len == 18 && memcmp(attribute_name, "memTransferRatemin", 18) == 0)
+        performance_level->mem_transfer_rate_min = attribute_value;
+    else if(attribute_name_len == 18 && memcmp(attribute_name, "memTransferRatemax", 18) == 0)
+        performance_level->mem_transfer_rate_max = attribute_value;
+}
+
+// Parses one comma-separated attribute list into the next free performance level of the query passed as userdata
+static void attribute_line_callback(const char *str, size_t size, void *userdata) {
+    NVCTRLPerformanceLevelQuery *query = userdata;
+    if(query->num_performance_levels >= MAX_PERFORMANCE_LEVELS)
+        return; // the table is full, the remaining lines are ignored
+
+    NVCTRLPerformanceLevel *level = &query->performance_level[query->num_performance_levels++];
+    memset(level, 0, sizeof(*level));
+    split_by_delimiter(str, size, ',', attribute_callback, level);
+}
+
+// Queries the NV_CTRL_STRING_PERFORMANCE_MODES string and parses it into |query|.
+// |query| is always reset first, so it holds no performance levels when the query fails.
+// Returns true if the string could be queried.
+static bool xnvctrl_get_performance_levels(gsr_xnvctrl *xnvctrl, NVCTRLPerformanceLevelQuery *query) {
+    memset(query, 0, sizeof(*query));
+
+    char *modes = NULL;
+    const bool queried = xnvctrl->XNVCTRLQueryTargetStringAttribute(xnvctrl->display, NV_CTRL_TARGET_TYPE_GPU, 0, 0, NV_CTRL_STRING_PERFORMANCE_MODES, &modes);
+
+    // Each ';'-separated entry of the string is handed to attribute_line_callback
+    if(queried)
+        split_by_delimiter(modes, strlen(modes), ';', attribute_line_callback, query);
+
+    // The string is released with XFree whether or not it was parsed
+    if(modes)
+        XFree(modes);
+
+    return queried;
+}
+
+bool gsr_overclock_load(gsr_overclock *self, Display *display) { // Resets |self| and loads xnvctrl for |display|
+    memset(self, 0, sizeof(gsr_overclock));
+    self->num_performance_levels = 0; // already 0 after the memset above; set again explicitly
+
+    return gsr_xnvctrl_load(&self->xnvctrl, display); // the result of loading xnvctrl is the result of this function
+}
+
+void gsr_overclock_unload(gsr_overclock *self) { // Unloads the xnvctrl state set up by gsr_overclock_load
+    gsr_xnvctrl_unload(&self->xnvctrl);
+}
+
+// Applies a positive memory transfer rate offset; returns true only if the offset was actually set, false on any failure
+bool gsr_overclock_start(gsr_overclock *self) {
+    int basep = 0, errorp = 0;
+    if(!self->xnvctrl.XNVCTRLQueryExtension(self->xnvctrl.display, &basep, &errorp)) {
+        fprintf(stderr, "gsr warning: gsr_overclock_start: xnvctrl is not supported on your system, failed to overclock memory transfer rate\n");
+        return false;
+    }
+
+    NVCTRLPerformanceLevelQuery query;
+    if(!xnvctrl_get_performance_levels(&self->xnvctrl, &query) || query.num_performance_levels == 0) {
+        fprintf(stderr, "gsr warning: gsr_overclock_start: failed to get performance levels for overclocking\n");
+        return false;
+    }
+    self->num_performance_levels = query.num_performance_levels;
+    // Aim for half of the maximum valid offset, capped by the max transfer rate gap between the last level and level 2
+    int target_transfer_rate_offset = xnvctrl_get_memory_transfer_rate_max(&self->xnvctrl, &query) / 2;
+    if(query.num_performance_levels > 3) {
+        const int transfer_rate_max_diff = query.performance_level[query.num_performance_levels - 1].mem_transfer_rate_max - query.performance_level[2].mem_transfer_rate_max;
+        if(transfer_rate_max_diff > 0 && transfer_rate_max_diff < target_transfer_rate_offset)
+            target_transfer_rate_offset = transfer_rate_max_diff;
+    }
+
+    if(!xnvctrl_set_memory_transfer_rate_offset(&self->xnvctrl, self->num_performance_levels, target_transfer_rate_offset)) {
+        fprintf(stderr, "gsr warning: gsr_overclock_start: failed to overclock memory transfer rate offset to %d\n", target_transfer_rate_offset);
+        return false; // report the failure instead of claiming success
+    }
+    fprintf(stderr, "gsr info: gsr_overclock_start: successfully set memory transfer rate offset to %d\n", target_transfer_rate_offset);
+    return true;
+}
+
+void gsr_overclock_stop(gsr_overclock *self) { // Requests a memory transfer rate offset of 0 again; the result of the request is ignored
+    xnvctrl_set_memory_transfer_rate_offset(&self->xnvctrl, self->num_performance_levels, 0);
+}
diff --git a/src/vaapi.c b/src/vaapi.c
index bb1b1fd..93ef797 100644
--- a/src/vaapi.c
+++ b/src/vaapi.c
@@ -20,7 +20,7 @@ bool gsr_vaapi_load(gsr_vaapi *self) {
     };
 
     if(!dlsym_load_list(lib, required_dlsym)) {
-        fprintf(stderr, "gsr error: gsr_vaapi_load failed: missing required symbols in libcuda.so/libcuda.so.1\n");
+        fprintf(stderr, "gsr error: gsr_vaapi_load failed: missing required symbols in libva.so\n");
         goto fail;
     }
 
diff --git a/src/window_texture.c b/src/window_texture.c
index df34a37..741a145 100644
--- a/src/window_texture.c
+++ b/src/window_texture.c
@@ -88,6 +88,7 @@ int window_texture_on_resize(WindowTexture *self) {
     self->egl->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
     self->egl->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
 
+    while(self->egl->glGetError()) {}
     while(self->egl->eglGetError() != EGL_SUCCESS) {}
 
     image = self->egl->eglCreateImage(self->egl->egl_display, NULL, EGL_NATIVE_PIXMAP_KHR, (EGLClientBuffer)pixmap, pixmap_attrs);
diff --git a/src/xnvctrl.c b/src/xnvctrl.c
new file mode 100644
index 0000000..94d48d2
--- /dev/null
+++ b/src/xnvctrl.c
@@ -0,0 +1,44 @@
+#include "../include/xnvctrl.h"
+#include "../include/library_loader.h"
+#include <string.h>
+
+// Opens libXNVCtrl.so.0 and resolves the XNVCTRL functions listed below into |self|.
+// Returns false if the library can't be opened or if any of the listed symbols is missing.
+bool gsr_xnvctrl_load(gsr_xnvctrl *self, Display *display) {
+    memset(self, 0, sizeof(*self));
+    self->display = display;
+
+    dlerror(); /* clear */
+    void *xnvctrl_lib = dlopen("libXNVCtrl.so.0", RTLD_LAZY);
+    if(xnvctrl_lib == NULL) {
+        fprintf(stderr, "gsr error: gsr_xnvctrl_load failed: failed to load libXNVCtrl.so.0, error: %s\n", dlerror());
+        return false;
+    }
+
+    dlsym_assign symbols[] = {
+        { (void**)&self->XNVCTRLQueryExtension, "XNVCTRLQueryExtension" },
+        { (void**)&self->XNVCTRLSetTargetAttributeAndGetStatus, "XNVCTRLSetTargetAttributeAndGetStatus" },
+        { (void**)&self->XNVCTRLQueryValidTargetAttributeValues, "XNVCTRLQueryValidTargetAttributeValues" },
+        { (void**)&self->XNVCTRLQueryTargetStringAttribute, "XNVCTRLQueryTargetStringAttribute" },
+
+        { NULL, NULL }
+    };
+
+    if(dlsym_load_list(xnvctrl_lib, symbols)) {
+        self->library = xnvctrl_lib;
+        return true;
+    }
+
+    // A symbol is missing: close the library again and leave |self| fully cleared
+    fprintf(stderr, "gsr error: gsr_xnvctrl_load failed: missing required symbols in libXNVCtrl.so.0\n");
+    dlclose(xnvctrl_lib);
+    memset(self, 0, sizeof(*self));
+    return false;
+}
+
+void gsr_xnvctrl_unload(gsr_xnvctrl *self) { // Closes the loaded library and clears |self|
+    if(self->library) { // nothing to do when no library is loaded
+        dlclose(self->library);
+        memset(self, 0, sizeof(gsr_xnvctrl)); // also clears the display and the resolved function pointers
+    }
+}