author    | Steam Deck User <deck@arch.steamdeck> | 2023-03-16 13:36:19 +0100
committer | dec05eba <dec05eba@protonmail.com>    | 2023-03-17 11:44:51 +0100
commit    | 8cbdb596ebf79587a432ed40583630b6cd39ed88 (patch)
tree      | 82c60558aaaa7a1fc1eb9ffc388f1dd84a9529d2
parent    | 689419a78238626aba887e974cbfcf5dff99de81 (diff)
vaapi wip
-rw-r--r-- | README.md                    |   8
-rw-r--r-- | TODO                         |   3
-rwxr-xr-x | build.sh                     |  24
-rw-r--r-- | include/egl.h                |   3
-rw-r--r-- | include/vaapi.h              |  63
-rw-r--r-- | src/capture/xcomposite_drm.c | 708
-rw-r--r-- | src/cuda.c                   |   8
-rw-r--r-- | src/egl.c                    |  59
-rw-r--r-- | src/main.cpp                 | 208
-rw-r--r-- | src/vaapi.c                  |  41
-rw-r--r-- | src/window_texture.c         |   8
11 files changed, 688 insertions, 445 deletions
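
The core of this work-in-progress commit is the AMD/Intel capture path in src/capture/xcomposite_drm.c: instead of wrapping a GL texture in an AVDRMFrameDescriptor, the encoder's VAAPI surface (carried in frame->data[3]) is exported with vaExportSurfaceHandle() as DRM PRIME file descriptors, its NV12 planes are imported back into EGL as dma-buf images bound to two GL textures (Y as R8, UV as GR88), and the captured window texture is then rendered into those planes with the new Y/UV shaders. The following is a minimal sketch of that export/import step only, not the project's code; the helper name import_va_surface_as_textures is invented for illustration, and it assumes an already-current EGL context whose driver exposes EGL_EXT_image_dma_buf_import and GL_OES_EGL_image.

/* Sketch only: export a VAAPI surface as DRM PRIME fds and bind its NV12
 * planes to two GL textures, mirroring what this commit does in
 * src/capture/xcomposite_drm.c. Error handling is reduced to early returns
 * and the EGL images are left alive (the diff has a matching cleanup TODO). */
#include <stdint.h>
#include <va/va.h>
#include <va/va_drmcommon.h>
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include <GLES2/gl2.h>
#include <GLES2/gl2ext.h>

static uint32_t fourcc(char a, char b, char c, char d) {
    return ((uint32_t)d << 24) | ((uint32_t)c << 16) | ((uint32_t)b << 8) | (uint32_t)a;
}

/* Hypothetical helper name; returns 0 on success. */
static int import_va_surface_as_textures(VADisplay va_dpy, VASurfaceID surface,
                                         EGLDisplay egl_dpy, GLuint out_textures[2]) {
    VADRMPRIMESurfaceDescriptor prime;
    if(vaExportSurfaceHandle(va_dpy, surface, VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2,
                             VA_EXPORT_SURFACE_READ_WRITE | VA_EXPORT_SURFACE_SEPARATE_LAYERS,
                             &prime) != VA_STATUS_SUCCESS)
        return -1;
    vaSyncSurface(va_dpy, surface);

    if(prime.fourcc != fourcc('N', 'V', '1', '2'))
        return -1; /* the commit only handles the NV12 layout it sees on AMD */

    PFNGLEGLIMAGETARGETTEXTURE2DOESPROC glEGLImageTargetTexture2DOES =
        (PFNGLEGLIMAGETARGETTEXTURE2DOESPROC)eglGetProcAddress("glEGLImageTargetTexture2DOES");
    if(!glEGLImageTargetTexture2DOES)
        return -1;

    /* Layer 0 is the Y plane (single-channel R8), layer 1 the interleaved
     * UV plane (two-channel GR88) at half width/height. */
    const uint32_t layer_formats[2] = { fourcc('R', '8', ' ', ' '), fourcc('G', 'R', '8', '8') };
    glGenTextures(2, out_textures);
    for(int i = 0; i < 2; ++i) {
        const uint32_t object = prime.layers[i].object_index[0];
        const EGLAttrib attrs[] = {
            EGL_LINUX_DRM_FOURCC_EXT,      (EGLAttrib)layer_formats[i],
            EGL_WIDTH,                     (EGLAttrib)(prime.width  >> i),
            EGL_HEIGHT,                    (EGLAttrib)(prime.height >> i),
            EGL_DMA_BUF_PLANE0_FD_EXT,     (EGLAttrib)prime.objects[object].fd,
            EGL_DMA_BUF_PLANE0_OFFSET_EXT, (EGLAttrib)prime.layers[i].offset[0],
            EGL_DMA_BUF_PLANE0_PITCH_EXT,  (EGLAttrib)prime.layers[i].pitch[0],
            EGL_NONE
        };
        EGLImage image = eglCreateImage(egl_dpy, EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, attrs);
        if(image == EGL_NO_IMAGE)
            return -1;

        glBindTexture(GL_TEXTURE_2D, out_textures[i]);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
        glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, (GLeglImageOES)image); /* texture now aliases the plane */
        glBindTexture(GL_TEXTURE_2D, 0);
    }
    return 0;
}

Because the textures alias the encoder surface's memory, the per-plane render passes later in the diff write straight into the frame that is submitted to the VAAPI encoder, with no read-back or extra copy.
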
@@ -26,7 +26,10 @@ If you are running another distro then you can run `install.sh` as root: `sudo . You can also install gpu screen recorder ([the gtk gui version](https://git.dec05eba.com/gpu-screen-recorder-gtk/)) from [flathub](https://flathub.org/apps/details/com.dec05eba.gpu_screen_recorder). # Dependencies -`libglvnd (which provides libgl and libegl), (mesa if you are using an amd or intel gpu), ffmpeg (libavcodec, libavformat, libavutil, libswresample, libavfilter), libx11, libxcomposite, libpulse`. You need to additionally have `libcuda.so` installed when you run `gpu-screen-recorder` and `libnvidia-fbc.so.1` when using nvfbc.\ +## AMD/Intel +`libglvnd (which provides libgl and libegl), mesa, ffmpeg (libavcodec, libavformat, libavutil, libswresample, libavfilter), libx11, libxcomposite, libpulse, libva.so`. +## NVIDIA +`libglvnd (which provides libgl and libegl), ffmpeg (libavcodec, libavformat, libavutil, libswresample, libavfilter), libx11, libxcomposite, libpulse, libcuda.so`. Additionally, you need to have `libnvidia-fbc.so.1` installed when using nvfbc. # How to use Run `scripts/interactive.sh` or run gpu-screen-recorder directly, for example: `gpu-screen-recorder -w $(xdotool selectwindow) -c mp4 -f 60 -a "$(pactl get-default-sink).monitor" -o test_video.mp4` then stop the screen recorder with Ctrl+C, which will also save the recording. You can change -w to -w screen if you want to record all monitors or if you want to record a specific monitor then you can use -w monitor-name, for example -w HDMI-0 (use xrandr command to find the name of your monitor. The name can also be found in your desktop environments display settings).\ @@ -56,8 +59,7 @@ The plugin does everything on the GPU and gives the texture to OBS, but OBS does FFMPEG only uses the GPU with CUDA when doing transcoding from an input video to an output video, and not when recording the screen when using x11grab. So FFMPEG has the same fps drop issues that OBS has. # TODO -* Support AMD and Intel, using VAAPI. Currently there are a lot of driver bugs with both AMD and Intel that causes video encoding to either fail, performance issues or causes the entire driver to crash. * Dynamically change bitrate/resolution to match desired fps. This would be helpful when streaming for example, where the encode output speed also depends on upload speed to the streaming service. -* Show cursor when recording. Currently the cursor is not visible when recording a window. +* Show cursor when recording. Currently the cursor is not visible when recording a window or when using amd/intel. * Implement opengl injection to capture texture. This fixes VRR without having to use NvFBC direct capture. * Always use direct capture with NvFBC once the capture issue in mpv fullscreen has been resolved (maybe detect if direct capture fails in nvfbc and switch to non-direct recording. NvFBC says if direct capture fails). @@ -13,4 +13,5 @@ Implement follow focused in drm. Support fullscreen capture on amd/intel using external kms process. Support amf and qsv. Disable flipping on nvidia? this might fix some stuttering issues on some setups. See NvCtrlGetAttribute/NvCtrlSetAttributeAndGetStatus NV_CTRL_SYNC_TO_VBLANK https://github.com/NVIDIA/nvidia-settings/blob/d5f022976368cbceb2f20b838ddb0bf992f0cfb9/src/gtk%2B-2.x/ctkopengl.c. -Replays seem to have some issues with audio/video. Why?
\ No newline at end of file +Replays seem to have some issues with audio/video. Why? +Cleanup unused gl/egl functions, macro, etc.
\ No newline at end of file @@ -4,15 +4,17 @@ dependencies="libavcodec libavformat libavutil x11 xcomposite xrandr libpulse libswresample libavfilter" includes="$(pkg-config --cflags $dependencies)" libs="$(pkg-config --libs $dependencies) -ldl -pthread -lm" -gcc -c src/capture/capture.c -O2 -g0 -DNDEBUG $includes -gcc -c src/capture/nvfbc.c -O2 -g0 -DNDEBUG $includes -gcc -c src/capture/xcomposite_cuda.c -O2 -g0 -DNDEBUG $includes -gcc -c src/capture/xcomposite_drm.c -O2 -g0 -DNDEBUG $includes -gcc -c src/egl.c -O2 -g0 -DNDEBUG $includes -gcc -c src/cuda.c -O2 -g0 -DNDEBUG $includes -gcc -c src/window_texture.c -O2 -g0 -DNDEBUG $includes -gcc -c src/time.c -O2 -g0 -DNDEBUG $includes -g++ -c src/sound.cpp -O2 -g0 -DNDEBUG $includes -g++ -c src/main.cpp -O2 -g0 -DNDEBUG $includes -g++ -o gpu-screen-recorder -O2 capture.o nvfbc.o egl.o cuda.o window_texture.o time.o xcomposite_cuda.o xcomposite_drm.o sound.o main.o -s $libs +opts="-O2 -g0 -DNDEBUG" +gcc -c src/capture/capture.c $opts $includes +gcc -c src/capture/nvfbc.c $opts $includes +gcc -c src/capture/xcomposite_cuda.c $opts $includes +gcc -c src/capture/xcomposite_drm.c $opts $includes +gcc -c src/egl.c $opts $includes +gcc -c src/cuda.c $opts $includes +gcc -c src/vaapi.c $opts $includes +gcc -c src/window_texture.c $opts $includes +gcc -c src/time.c $opts $includes +g++ -c src/sound.cpp $opts $includes +g++ -c src/main.cpp $opts $includes +g++ -o gpu-screen-recorder -O2 capture.o nvfbc.o egl.o cuda.o vaapi.o window_texture.o time.o xcomposite_cuda.o xcomposite_drm.o sound.o main.o -s $libs echo "Successfully built gpu-screen-recorder" diff --git a/include/egl.h b/include/egl.h index 6e8f763..f4c47ff 100644 --- a/include/egl.h +++ b/include/egl.h @@ -33,12 +33,14 @@ typedef void* EGLImageKHR; typedef void *GLeglImageOES; typedef void (*__eglMustCastToProperFunctionPointerType)(void); +#define EGL_SUCCESS 0x3000 #define EGL_BUFFER_SIZE 0x3020 #define EGL_RENDERABLE_TYPE 0x3040 #define EGL_OPENGL_ES2_BIT 0x0004 #define EGL_NONE 0x3038 #define EGL_CONTEXT_CLIENT_VERSION 0x3098 #define EGL_BACK_BUFFER 0x3084 +#define EGL_GL_TEXTURE_2D 0x30B1 #define GL_TEXTURE_2D 0x0DE1 #define GL_RGB 0x1907 @@ -97,6 +99,7 @@ typedef struct { EGLContext egl_context; Window window; + int32_t (*eglGetError)(void); EGLDisplay (*eglGetDisplay)(EGLNativeDisplayType display_id); unsigned int (*eglInitialize)(EGLDisplay dpy, int32_t *major, int32_t *minor); unsigned int (*eglTerminate)(EGLDisplay dpy); diff --git a/include/vaapi.h b/include/vaapi.h new file mode 100644 index 0000000..7ec73b2 --- /dev/null +++ b/include/vaapi.h @@ -0,0 +1,63 @@ +#ifndef GSR_VAAPI_H +#define GSR_VAAPI_H + +#include <stdint.h> +#include <stdbool.h> + +typedef void* VADisplay; +typedef int VAStatus; +typedef unsigned int VAGenericID; +typedef VAGenericID VASurfaceID; + +typedef struct { + /** Pixel format fourcc of the whole surface (VA_FOURCC_*). */ + uint32_t fourcc; + /** Width of the surface in pixels. */ + uint32_t width; + /** Height of the surface in pixels. */ + uint32_t height; + /** Number of distinct DRM objects making up the surface. */ + uint32_t num_objects; + /** Description of each object. */ + struct { + /** DRM PRIME file descriptor for this object. */ + int fd; + /** Total size of this object (may include regions which are + * not part of the surface). */ + uint32_t size; + /** Format modifier applied to this object. */ + uint64_t drm_format_modifier; + } objects[4]; + /** Number of layers making up the surface. 
*/ + uint32_t num_layers; + /** Description of each layer in the surface. */ + struct { + /** DRM format fourcc of this layer (DRM_FOURCC_*). */ + uint32_t drm_format; + /** Number of planes in this layer. */ + uint32_t num_planes; + /** Index in the objects array of the object containing each + * plane. */ + uint32_t object_index[4]; + /** Offset within the object of each plane. */ + uint32_t offset[4]; + /** Pitch of each plane. */ + uint32_t pitch[4]; + } layers[4]; +} VADRMPRIMESurfaceDescriptor; + +#define VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2 0x40000000 +#define VA_EXPORT_SURFACE_READ_WRITE 0x0003 +#define VA_EXPORT_SURFACE_SEPARATE_LAYERS 0x0004 + +typedef struct { + void *library; + + VAStatus (*vaExportSurfaceHandle)(VADisplay dpy, VASurfaceID surface_id, uint32_t mem_type, uint32_t flags, void *descriptor); + VAStatus (*vaSyncSurface)(VADisplay dpy, VASurfaceID render_target); +} gsr_vaapi; + +bool gsr_vaapi_load(gsr_vaapi *self); +void gsr_vaapi_unload(gsr_vaapi *self); + +#endif /* GSR_VAAPI_H */ diff --git a/src/capture/xcomposite_drm.c b/src/capture/xcomposite_drm.c index 6e3f7bb..9fb323d 100644 --- a/src/capture/xcomposite_drm.c +++ b/src/capture/xcomposite_drm.c @@ -1,5 +1,6 @@ #include "../../include/capture/xcomposite_drm.h" #include "../../include/egl.h" +#include "../../include/vaapi.h" #include "../../include/window_texture.h" #include "../../include/time.h" #include <stdlib.h> @@ -7,7 +8,7 @@ #include <X11/Xlib.h> #include <X11/extensions/Xcomposite.h> #include <libavutil/hwcontext.h> -#include <libavutil/hwcontext_drm.h> +#include <libavutil/hwcontext_vaapi.h> #include <libavutil/frame.h> #include <libavcodec/avcodec.h> //#include <drm_fourcc.h> @@ -28,6 +29,7 @@ typedef struct { WindowTexture window_texture; gsr_egl egl; + gsr_vaapi vaapi; int fourcc; int num_planes; @@ -36,12 +38,16 @@ typedef struct { int32_t stride; int32_t offset; - unsigned int target_texture_id; + unsigned int target_textures[2]; - unsigned int FramebufferName; - unsigned int quad_VertexArrayID; - unsigned int quad_vertexbuffer; + unsigned int FramebufferNameY; + unsigned int FramebufferNameUV; // TODO: Remove unsigned int quadVAO; + + unsigned int shader_y; + unsigned int shader_uv; + + VADisplay va_dpy; } gsr_capture_xcomposite_drm; static int max_int(int a, int b) { @@ -71,11 +77,16 @@ static bool drm_create_codec_context(gsr_capture_xcomposite_drm *cap_xcomp, AVCo (AVHWFramesContext *)frame_context->data; hw_frame_context->width = video_codec_context->width; hw_frame_context->height = video_codec_context->height; - hw_frame_context->sw_format = AV_PIX_FMT_YUV420P;//AV_PIX_FMT_0RGB32;//AV_PIX_FMT_YUV420P;//AV_PIX_FMT_0RGB32;//AV_PIX_FMT_NV12; + hw_frame_context->sw_format = AV_PIX_FMT_NV12;//AV_PIX_FMT_0RGB32;//AV_PIX_FMT_YUV420P;//AV_PIX_FMT_0RGB32;//AV_PIX_FMT_NV12; hw_frame_context->format = video_codec_context->pix_fmt; hw_frame_context->device_ref = device_ctx; hw_frame_context->device_ctx = (AVHWDeviceContext*)device_ctx->data; + hw_frame_context->initial_pool_size = 1; + + AVVAAPIDeviceContext *vactx =((AVHWDeviceContext*)device_ctx->data)->hwctx; + cap_xcomp->va_dpy = vactx->display; + if (av_hwframe_ctx_init(frame_context) < 0) { fprintf(stderr, "Error: Failed to initialize hardware frame context " "(note: ffmpeg version needs to be > 4.0)\n"); @@ -89,45 +100,6 @@ static bool drm_create_codec_context(gsr_capture_xcomposite_drm *cap_xcomp, AVCo return true; } -#define EGL_SURFACE_TYPE 0x3033 -#define EGL_WINDOW_BIT 0x0004 -#define EGL_PIXMAP_BIT 0x0002 -#define 
EGL_BIND_TO_TEXTURE_RGB 0x3039 -#define EGL_TRUE 1 -#define EGL_RED_SIZE 0x3024 -#define EGL_GREEN_SIZE 0x3023 -#define EGL_BLUE_SIZE 0x3022 -#define EGL_ALPHA_SIZE 0x3021 -#define EGL_TEXTURE_FORMAT 0x3080 -#define EGL_TEXTURE_RGB 0x305D -#define EGL_TEXTURE_TARGET 0x3081 -#define EGL_TEXTURE_2D 0x305F -#define EGL_GL_TEXTURE_2D 0x30B1 - -#define GL_RGBA 0x1908 - -static unsigned int gl_create_texture(gsr_capture_xcomposite_drm *cap_xcomp, int width, int height) { - // Generating this second texture is needed because - // cuGraphicsGLRegisterImage cant be used with the texture that is mapped - // directly to the pixmap. - // TODO: Investigate if it's somehow possible to use the pixmap texture - // directly, this should improve performance since only less image copy is - // then needed every frame. - // Ignoring failure for now.. TODO: Show proper error - unsigned int texture_id = 0; - cap_xcomp->egl.glGenTextures(1, &texture_id); - cap_xcomp->egl.glBindTexture(GL_TEXTURE_2D, texture_id); - cap_xcomp->egl.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - - cap_xcomp->egl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - cap_xcomp->egl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - cap_xcomp->egl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - cap_xcomp->egl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - - cap_xcomp->egl.glBindTexture(GL_TEXTURE_2D, 0); - return texture_id; -} - #define GL_COMPILE_STATUS 0x8B81 #define GL_INFO_LOG_LENGTH 0x8B84 @@ -249,10 +221,7 @@ unsigned int esLoadProgram ( gsr_capture_xcomposite_drm *cap_xcomp, const char * return programObject; } -static unsigned int shader_program = 0; -static unsigned int texID = 0; - -static void LoadShaders(gsr_capture_xcomposite_drm *cap_xcomp) { +static unsigned int LoadShadersY(gsr_capture_xcomposite_drm *cap_xcomp) { char vShaderStr[] = "#version 300 es \n" "in vec2 pos; \n" @@ -322,13 +291,164 @@ static void LoadShaders(gsr_capture_xcomposite_drm *cap_xcomp) { "#version 300 es \n" "precision mediump float; \n" "in vec2 texcoords_out; \n" + "uniform sampler2D tex1; \n" + //"uniform sampler2D tex2; \n" + "out vec4 FragColor; \n" + //"out vec4 FragColor2; \n" + "mat4 RGBtoYUV() {\n" + " return mat4(\n" + " vec4(0.257, 0.439, -0.148, 0.0),\n" + " vec4(0.504, -0.368, -0.291, 0.0),\n" + " vec4(0.098, -0.071, 0.439, 0.0),\n" + " vec4(0.0625, 0.500, 0.500, 1.0)\n" + " );\n" + "}\n" + "void main() \n" + "{ \n" + //" vec3 yuv = rgb2yuv(texture(tex1, texcoords_out).rgb); \n" + //" FragColor.x = yuv.x; \n" + //" FragColor2.xy = yuv.xy; \n" + //" vec3 rgb = texture(tex1, texcoords_out).rgb;\n" + "FragColor.x = (RGBtoYUV() * vec4(texture(tex1, texcoords_out).rgb, 1.0)).x;\n" + //"FragColor2.xy = (RGBtoYUV() * vec4(texture(tex1, texcoords_out*2.0).rgb, 1.0)).zy;\n" + "} \n"; +#else + char fShaderStr[] = + "#version 300 es \n" + "precision mediump float; \n" + "in vec2 texcoords_out; \n" "uniform sampler2D tex; \n" "out vec4 FragColor; \n" + + "vec3 rgb2yuv(vec3 rgb){\n" + " float y = 0.299*rgb.r + 0.587*rgb.g + 0.114*rgb.b;\n" + " return vec3(y, 0.493*(rgb.b-y), 0.877*(rgb.r-y));\n" + "}\n" + + "vec3 yuv2rgb(vec3 yuv){\n" + " float y = yuv.x;\n" + " float u = yuv.y;\n" + " float v = yuv.z;\n" + " \n" + " return vec3(\n" + " y + 1.0/0.877*v,\n" + " y - 0.39393*u - 0.58081*v,\n" + " y + 1.0/0.493*u\n" + " );\n" + "}\n" + "void main() \n" "{ \n" - " vec3 rgb = texture(tex, texcoords_out).rgb; \n" + " float s = 
0.5;\n" + " vec3 lum = texture(tex, texcoords_out).rgb;\n" + " vec3 chr = texture(tex, floor(texcoords_out*s-.5)/s).rgb;\n" + " vec3 rgb = vec3(rgb2yuv(lum).x, rgb2yuv(chr).yz);\n" " FragColor = vec4(rgb, 1.0); \n" "} \n"; +#endif + + unsigned int shader_program = esLoadProgram(cap_xcomp, vShaderStr, fShaderStr); + if (shader_program == 0) { + fprintf(stderr, "failed to create shader!\n"); + return 0; + } + + cap_xcomp->egl.glBindAttribLocation(shader_program, 0, "pos"); + cap_xcomp->egl.glBindAttribLocation(shader_program, 1, "texcoords"); + return shader_program; +} + +static unsigned int LoadShadersUV(gsr_capture_xcomposite_drm *cap_xcomp) { + char vShaderStr[] = + "#version 300 es \n" + "in vec2 pos; \n" + "in vec2 texcoords; \n" + "out vec2 texcoords_out; \n" + "void main() \n" + "{ \n" + " texcoords_out = texcoords; \n" + " gl_Position = vec4(pos.x, pos.y, 0.0, 1.0); \n" + "} \n"; + +#if 0 + char fShaderStr[] = + "#version 300 es \n" + "precision mediump float; \n" + "in vec2 texcoords_out; \n" + "uniform sampler2D tex; \n" + "out vec4 FragColor; \n" + + + "float imageWidth = 1920.0;\n" + "float imageHeight = 1080.0;\n" + + "float getYPixel(vec2 position) {\n" + " position.y = (position.y * 2.0 / 3.0) + (1.0 / 3.0);\n" + " return texture2D(tex, position).x;\n" + "}\n" +"\n" + "vec2 mapCommon(vec2 position, float planarOffset) {\n" + " planarOffset += (imageWidth * floor(position.y / 2.0)) / 2.0 +\n" + " floor((imageWidth - 1.0 - position.x) / 2.0);\n" + " float x = floor(imageWidth - 1.0 - floor(mod(planarOffset, imageWidth)));\n" + " float y = floor(floor(planarOffset / imageWidth));\n" + " return vec2((x + 0.5) / imageWidth, (y + 0.5) / (1.5 * imageHeight));\n" + "}\n" +"\n" + "vec2 mapU(vec2 position) {\n" + " float planarOffset = (imageWidth * imageHeight) / 4.0;\n" + " return mapCommon(position, planarOffset);\n" + "}\n" +"\n" + "vec2 mapV(vec2 position) {\n" + " return mapCommon(position, 0.0);\n" + "}\n" + + "void main() \n" + "{ \n" + + "vec2 pixelPosition = vec2(floor(imageWidth * texcoords_out.x),\n" + " floor(imageHeight * texcoords_out.y));\n" + "pixelPosition -= vec2(0.5, 0.5);\n" +"\n" + "float yChannel = getYPixel(texcoords_out);\n" + "float uChannel = texture2D(tex, mapU(pixelPosition)).x;\n" + "float vChannel = texture2D(tex, mapV(pixelPosition)).x;\n" + "vec4 channels = vec4(yChannel, uChannel, vChannel, 1.0);\n" + "mat4 conversion = mat4(1.0, 0.0, 1.402, -0.701,\n" + " 1.0, -0.344, -0.714, 0.529,\n" + " 1.0, 1.772, 0.0, -0.886,\n" + " 0, 0, 0, 0);\n" + "vec3 rgb = (channels * conversion).xyz;\n" + + " FragColor = vec4(rgb, 1.0); \n" + "} \n"; +#elif 1 + char fShaderStr[] = + "#version 300 es \n" + "precision mediump float; \n" + "in vec2 texcoords_out; \n" + "uniform sampler2D tex1; \n" + //"uniform sampler2D tex2; \n" + "out vec4 FragColor; \n" + //"out vec4 FragColor2; \n" + "mat4 RGBtoYUV() {\n" + " return mat4(\n" + " vec4(0.257, 0.439, -0.148, 0.0),\n" + " vec4(0.504, -0.368, -0.291, 0.0),\n" + " vec4(0.098, -0.071, 0.439, 0.0),\n" + " vec4(0.0625, 0.500, 0.500, 1.0)\n" + " );\n" + "}\n" + "void main() \n" + "{ \n" + //" vec3 yuv = rgb2yuv(texture(tex1, texcoords_out).rgb); \n" + //" FragColor.x = yuv.x; \n" + //" FragColor2.xy = yuv.xy; \n" + //" vec3 rgb = texture(tex1, texcoords_out).rgb;\n" + //"FragColor.x = (RGBtoYUV() * vec4(texture(tex1, texcoords_out).rgb, 1.0)).x;\n" + "FragColor.xy = (RGBtoYUV() * vec4(texture(tex1, texcoords_out*2.0).rgb, 1.0)).zy;\n" + "} \n"; #else char fShaderStr[] = "#version 300 es \n" @@ -364,15 +484,15 @@ static void 
LoadShaders(gsr_capture_xcomposite_drm *cap_xcomp) { "} \n"; #endif - shader_program = esLoadProgram(cap_xcomp, vShaderStr, fShaderStr); + unsigned int shader_program = esLoadProgram(cap_xcomp, vShaderStr, fShaderStr); if (shader_program == 0) { fprintf(stderr, "failed to create shader!\n"); - return; + return 0; } cap_xcomp->egl.glBindAttribLocation(shader_program, 0, "pos"); cap_xcomp->egl.glBindAttribLocation(shader_program, 1, "texcoords"); - return; + return shader_program; } #define GL_FLOAT 0x1406 @@ -381,12 +501,20 @@ static void LoadShaders(gsr_capture_xcomposite_drm *cap_xcomp) { #define GL_TRIANGLES 0x0004 #define DRM_FORMAT_MOD_INVALID 72057594037927935 +#define EGL_TRUE 1 +#define EGL_IMAGE_PRESERVED_KHR 0x30D2 +#define EGL_NATIVE_PIXMAP_KHR 0x30B0 + +static uint32_t fourcc(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return (d << 24) | (c << 16) | (b << 8) | a; +} + static int gsr_capture_xcomposite_drm_start(gsr_capture *cap, AVCodecContext *video_codec_context) { gsr_capture_xcomposite_drm *cap_xcomp = cap->priv; XWindowAttributes attr; if(!XGetWindowAttributes(cap_xcomp->dpy, cap_xcomp->params.window, &attr)) { - fprintf(stderr, "gsr error: gsr_capture_xcomposite_start failed: invalid window id: %lu\n", cap_xcomp->params.window); + fprintf(stderr, "gsr error: gsr_capture_xcomposite_drm_start failed: invalid window id: %lu\n", cap_xcomp->params.window); return -1; } @@ -399,18 +527,24 @@ static int gsr_capture_xcomposite_drm_start(gsr_capture *cap, AVCodecContext *vi XSelectInput(cap_xcomp->dpy, cap_xcomp->params.window, StructureNotifyMask | ExposureMask); if(!gsr_egl_load(&cap_xcomp->egl, cap_xcomp->dpy)) { - fprintf(stderr, "gsr error: gsr_capture_xcomposite_start: failed to load opengl\n"); + fprintf(stderr, "gsr error: gsr_capture_xcomposite_drm_start: failed to load opengl\n"); return -1; } if(!cap_xcomp->egl.eglExportDMABUFImageQueryMESA) { - fprintf(stderr, "gsr error: gsr_capture_xcomposite_start: could not find eglExportDMABUFImageQueryMESA\n"); + fprintf(stderr, "gsr error: gsr_capture_xcomposite_drm_start: could not find eglExportDMABUFImageQueryMESA\n"); gsr_egl_unload(&cap_xcomp->egl); return -1; } if(!cap_xcomp->egl.eglExportDMABUFImageMESA) { - fprintf(stderr, "gsr error: gsr_capture_xcomposite_start: could not find eglExportDMABUFImageMESA\n"); + fprintf(stderr, "gsr error: gsr_capture_xcomposite_drm_start: could not find eglExportDMABUFImageMESA\n"); + gsr_egl_unload(&cap_xcomp->egl); + return -1; + } + + if(!gsr_vaapi_load(&cap_xcomp->vaapi)) { + fprintf(stderr, "gsr error: gsr_capture_xcomposite_drm_start: failed to load vaapi\n"); gsr_egl_unload(&cap_xcomp->egl); return -1; } @@ -462,17 +596,6 @@ static int gsr_capture_xcomposite_drm_start(gsr_capture *cap, AVCodecContext *vi cap_xcomp->egl.glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &cap_xcomp->texture_size.y); cap_xcomp->egl.glBindTexture(GL_TEXTURE_2D, 0); - #if 1 - cap_xcomp->target_texture_id = gl_create_texture(cap_xcomp, cap_xcomp->texture_size.x, cap_xcomp->texture_size.y); - if(cap_xcomp->target_texture_id == 0) { - fprintf(stderr, "gsr error: gsr_capture_xcomposite_drm_start: failed to create opengl texture\n"); - return -1; - } - #else - // TODO: - cap_xcomp->target_texture_id = window_texture_get_opengl_texture_id(&cap_xcomp->window_texture); - #endif - cap_xcomp->texture_size.x = max_int(2, cap_xcomp->texture_size.x & ~1); cap_xcomp->texture_size.y = max_int(2, cap_xcomp->texture_size.y & ~1); @@ -480,13 +603,18 @@ static int 
gsr_capture_xcomposite_drm_start(gsr_capture *cap, AVCodecContext *vi video_codec_context->height = cap_xcomp->texture_size.y; { - EGLImage img = cap_xcomp->egl.eglCreateImage(cap_xcomp->egl.egl_display, cap_xcomp->egl.egl_context, EGL_GL_TEXTURE_2D, (EGLClientBuffer)(uint64_t)cap_xcomp->target_texture_id, NULL); + const intptr_t pixmap_attrs[] = { + EGL_IMAGE_PRESERVED_KHR, EGL_TRUE, + EGL_NONE, + }; + + EGLImage img = cap_xcomp->egl.eglCreateImage(cap_xcomp->egl.egl_display, cap_xcomp->egl.egl_context, EGL_GL_TEXTURE_2D, (EGLClientBuffer)(uint64_t)window_texture_get_opengl_texture_id(&cap_xcomp->window_texture), pixmap_attrs); if(!img) { fprintf(stderr, "eglCreateImage failed\n"); return -1; } - if(!cap_xcomp->egl.eglExportDMABUFImageQueryMESA(cap_xcomp->egl.egl_display, img, &cap_xcomp->fourcc, &cap_xcomp->num_planes, &cap_xcomp->modifiers) || cap_xcomp->modifiers == DRM_FORMAT_MOD_INVALID) { + if(!cap_xcomp->egl.eglExportDMABUFImageQueryMESA(cap_xcomp->egl.egl_display, img, &cap_xcomp->fourcc, &cap_xcomp->num_planes, &cap_xcomp->modifiers)) { fprintf(stderr, "eglExportDMABUFImageQueryMESA failed\n"); return -1; } @@ -502,102 +630,21 @@ static int gsr_capture_xcomposite_drm_start(gsr_capture *cap, AVCodecContext *vi return -1; } - fprintf(stderr, "texture: %u, dmabuf: %d, stride: %d, offset: %d\n", cap_xcomp->target_texture_id, cap_xcomp->dmabuf_fd, cap_xcomp->stride, cap_xcomp->offset); + fprintf(stderr, "texture: %u, dmabuf: %d, stride: %d, offset: %d\n", window_texture_get_opengl_texture_id(&cap_xcomp->window_texture), cap_xcomp->dmabuf_fd, cap_xcomp->stride, cap_xcomp->offset); fprintf(stderr, "fourcc: %d, num planes: %d, modifiers: %zu\n", cap_xcomp->fourcc, cap_xcomp->num_planes, cap_xcomp->modifiers); } - cap_xcomp->egl.glGenFramebuffers(1, &cap_xcomp->FramebufferName); - cap_xcomp->egl.glBindFramebuffer(GL_FRAMEBUFFER, cap_xcomp->FramebufferName); - - cap_xcomp->egl.glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, cap_xcomp->target_texture_id, 0); - - // Set the list of draw buffers. 
- unsigned int DrawBuffers[1] = {GL_COLOR_ATTACHMENT0}; - cap_xcomp->egl.glDrawBuffers(1, DrawBuffers); // "1" is the size of DrawBuffers - - if(cap_xcomp->egl.glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - fprintf(stderr, "Failed to setup framebuffer\n"); - return -1; - } - - cap_xcomp->egl.glBindFramebuffer(GL_FRAMEBUFFER, 0); - - //cap_xcomp->egl.glGenVertexArrays(1, &cap_xcomp->quad_VertexArrayID); - //cap_xcomp->egl.glBindVertexArray(cap_xcomp->quad_VertexArrayID); - - static const float g_quad_vertex_buffer_data[] = { - -1.0f, -1.0f, 0.0f, - 1.0f, -1.0f, 0.0f, - -1.0f, 1.0f, 0.0f, - -1.0f, 1.0f, 0.0f, - 1.0f, -1.0f, 0.0f, - 1.0f, 1.0f, 0.0f, - }; - - //cap_xcomp->egl.glGenBuffers(1, &cap_xcomp->quad_vertexbuffer); - //cap_xcomp->egl.glBindBuffer(GL_ARRAY_BUFFER, cap_xcomp->quad_vertexbuffer); - //cap_xcomp->egl.glBufferData(GL_ARRAY_BUFFER, sizeof(g_quad_vertex_buffer_data), g_quad_vertex_buffer_data, GL_STATIC_DRAW); - - // Create and compile our GLSL program from the shaders - LoadShaders(cap_xcomp); - texID = cap_xcomp->egl.glGetUniformLocation(shader_program, "tex"); - fprintf(stderr, "uniform id: %u\n", texID); - - float vVertices[] = { - -1.0f, 1.0f, 0.0f, 1.0f, - -1.0f, -1.0f, 0.0f, 0.0f, - 1.0f, -1.0f, 1.0f, 0.0f, - - -1.0f, 1.0f, 0.0f, 1.0f, - 1.0f, -1.0f, 1.0f, 0.0f, - 1.0f, 1.0f, 1.0f, 1.0f - }; - - unsigned int quadVBO; - cap_xcomp->egl.glGenVertexArrays(1, &cap_xcomp->quadVAO); - cap_xcomp->egl.glGenBuffers(1, &quadVBO); - cap_xcomp->egl.glBindVertexArray(cap_xcomp->quadVAO); - cap_xcomp->egl.glBindBuffer(GL_ARRAY_BUFFER, quadVBO); - cap_xcomp->egl.glBufferData(GL_ARRAY_BUFFER, sizeof(vVertices), &vVertices, GL_STATIC_DRAW); - - cap_xcomp->egl.glEnableVertexAttribArray(0); - cap_xcomp->egl.glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (void*)0); - - cap_xcomp->egl.glEnableVertexAttribArray(1); - cap_xcomp->egl.glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (void*)(2 * sizeof(float))); - - cap_xcomp->egl.glBindVertexArray(0); - - //cap_xcomp->egl.glUniform1i(texID, window_texture_get_opengl_texture_id(&cap_xcomp->window_texture)); - - //cap_xcomp->egl.glViewport(0, 0, 1920, 1080); - - //cap_xcomp->egl.glBindBuffer(GL_ARRAY_BUFFER, 0); - //cap_xcomp->egl.glBindVertexArray(0); - if(!drm_create_codec_context(cap_xcomp, video_codec_context)) { fprintf(stderr, "failed to create hw codec context\n"); gsr_egl_unload(&cap_xcomp->egl); return -1; } - fprintf(stderr, "sneed: %u\n", cap_xcomp->FramebufferName); + //fprintf(stderr, "sneed: %u\n", cap_xcomp->FramebufferName); return 0; #endif } -// TODO: -static void free_desc(void *opaque, uint8_t *data) { - AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)data; - int i; - - //for (i = 0; i < desc->nb_objects; i++) - // close(desc->objects[i].fd); - - av_free(desc); -} - - static void gsr_capture_xcomposite_drm_tick(gsr_capture *cap, AVCodecContext *video_codec_context, AVFrame **frame) { gsr_capture_xcomposite_drm *cap_xcomp = cap->priv; @@ -606,77 +653,6 @@ static void gsr_capture_xcomposite_drm_tick(gsr_capture *cap, AVCodecContext *vi if(!cap_xcomp->created_hw_frame) { cap_xcomp->created_hw_frame = true; - /*if(av_hwframe_get_buffer(video_codec_context->hw_frames_ctx, *frame, 0) < 0) { - fprintf(stderr, "gsr error: gsr_capture_xcomposite_drm_tick: av_hwframe_get_buffer failed\n"); - return; - }*/ - - AVDRMFrameDescriptor *desc = av_malloc(sizeof(AVDRMFrameDescriptor)); - if(!desc) { - fprintf(stderr, "poop\n"); - return; - } - - fprintf(stderr, "tick fd: %d\n", 
cap_xcomp->dmabuf_fd); - - cap_xcomp->egl.glBindTexture(GL_TEXTURE_2D, cap_xcomp->target_texture_id); - int xx = 0; - int yy = 0; - cap_xcomp->egl.glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &xx); - cap_xcomp->egl.glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &yy); - cap_xcomp->egl.glBindTexture(GL_TEXTURE_2D, 0); - - *desc = (AVDRMFrameDescriptor) { - .nb_objects = 1, - .objects[0] = { - .fd = cap_xcomp->dmabuf_fd, - .size = yy * cap_xcomp->stride, - .format_modifier = cap_xcomp->modifiers, - }, - .nb_layers = 1, - .layers[0] = { - .format = cap_xcomp->fourcc, // DRM_FORMAT_NV12 - .nb_planes = 1, //cap_xcomp->num_planes, // TODO: Ensure this is 1, otherwise ffmpeg cant handle it in av_hwframe_map - .planes[0] = { - .object_index = 0, - .offset = cap_xcomp->offset, - .pitch = cap_xcomp->stride, - }, - }, - }; - - #if 0 - AVBufferRef *device_ctx; - if(av_hwdevice_ctx_create(&device_ctx, AV_HWDEVICE_TYPE_DRM, "/dev/dri/card0", NULL, 0) < 0) { - fprintf(stderr, "Error: Failed to create hardware device context\n"); - return; - } - - AVBufferRef *frame_context = av_hwframe_ctx_alloc(device_ctx); - if(!frame_context) { - fprintf(stderr, "Error: Failed to create hwframe context\n"); - av_buffer_unref(&device_ctx); - return; - } - - AVHWFramesContext *hw_frame_context = - (AVHWFramesContext *)frame_context->data; - hw_frame_context->width = video_codec_context->width; - hw_frame_context->height = video_codec_context->height; - hw_frame_context->sw_format = AV_PIX_FMT_0RGB32; - hw_frame_context->format = AV_PIX_FMT_DRM_PRIME; - hw_frame_context->device_ref = device_ctx; - hw_frame_context->device_ctx = (AVHWDeviceContext*)device_ctx->data; - - if (av_hwframe_ctx_init(frame_context) < 0) { - fprintf(stderr, "Error: Failed to initialize hardware frame context " - "(note: ffmpeg version needs to be > 4.0)\n"); - av_buffer_unref(&device_ctx); - av_buffer_unref(&frame_context); - return; - } - #endif - av_frame_free(frame); *frame = av_frame_alloc(); if(!frame) { @@ -694,32 +670,184 @@ static void gsr_capture_xcomposite_drm_tick(gsr_capture *cap, AVCodecContext *vi return; } - AVFrame *src_frame = av_frame_alloc(); - assert(src_frame); - src_frame->format = AV_PIX_FMT_DRM_PRIME; - src_frame->width = video_codec_context->width; - src_frame->height = video_codec_context->height; - src_frame->color_range = AVCOL_RANGE_JPEG; - - src_frame->buf[0] = av_buffer_create((uint8_t*)desc, sizeof(*desc), - &free_desc, video_codec_context, 0); - if (!src_frame->buf[0]) { - fprintf(stderr, "failed to create buffer!\n"); + fprintf(stderr, "fourcc: %u\n", cap_xcomp->fourcc); + fprintf(stderr, "va surface id: %u\n", (VASurfaceID)(uintptr_t)(*frame)->data[3]); + + VADRMPRIMESurfaceDescriptor prime; + + VASurfaceID surface_id = (uintptr_t)(*frame)->data[3]; + VAStatus va_status = cap_xcomp->vaapi.vaExportSurfaceHandle(cap_xcomp->va_dpy, surface_id, VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2, VA_EXPORT_SURFACE_READ_WRITE | VA_EXPORT_SURFACE_SEPARATE_LAYERS, &prime); // TODO: Composed layers + if(va_status != VA_STATUS_SUCCESS) { + fprintf(stderr, "vaExportSurfaceHandle failed\n"); return; } + cap_xcomp->vaapi.vaSyncSurface(cap_xcomp->va_dpy, surface_id); + + fprintf(stderr, "fourcc: %u, width: %u, height: %u\n", prime.fourcc, prime.width, prime.height); + for(int i = 0; i < prime.num_layers; ++i) { + fprintf(stderr, " drm format: %u, num planes: %u\n", prime.layers[i].drm_format, prime.layers[i].num_planes); + for(int j = 0; j < prime.layers[i].num_planes; ++j) { + const uint32_t object_index = 
prime.layers[i].object_index[j]; + fprintf(stderr, " object index: %u, offset: %u, pitch: %u, fd: %d, size: %u, drm format mod: %lu\n", object_index, prime.layers[i].offset[j], prime.layers[i].pitch[j], prime.objects[object_index].fd, prime.objects[object_index].size, prime.objects[object_index].drm_format_modifier); + } + } - src_frame->data[0] = (uint8_t*)desc; - src_frame->extended_data = src_frame->data; - src_frame->format = AV_PIX_FMT_DRM_PRIME; + #define EGL_LINUX_DRM_FOURCC_EXT 0x3271 + #define EGL_WIDTH 0x3057 + #define EGL_HEIGHT 0x3056 + #define EGL_DMA_BUF_PLANE0_FD_EXT 0x3272 + #define EGL_DMA_BUF_PLANE0_OFFSET_EXT 0x3273 + #define EGL_DMA_BUF_PLANE0_PITCH_EXT 0x3274 + #define EGL_LINUX_DMA_BUF_EXT 0x3270 + + #define GL_TEXTURE0 0x84C0 + #define GL_COLOR_ATTACHMENT1 0x8CE1 + + #define FOURCC_NV12 842094158 + + if(prime.fourcc == FOURCC_NV12) { // This happens on AMD + while(cap_xcomp->egl.eglGetError() != EGL_SUCCESS){} + + EGLImage images[2]; + cap_xcomp->egl.glGenTextures(2, cap_xcomp->target_textures); + assert(cap_xcomp->egl.glGetError() == 0); + for(int i = 0; i < 2; ++i) { + const uint32_t formats[2] = { fourcc('R', '8', ' ', ' '), fourcc('G', 'R', '8', '8') }; + const int layer = i; + const int plane = 0; + + const intptr_t img_attr[] = { + EGL_LINUX_DRM_FOURCC_EXT, formats[i], + EGL_WIDTH, prime.width / (1 + i), // half size + EGL_HEIGHT, prime.height / (1 + i), // for chroma + EGL_DMA_BUF_PLANE0_FD_EXT, prime.objects[prime.layers[layer].object_index[plane]].fd, + EGL_DMA_BUF_PLANE0_OFFSET_EXT, prime.layers[layer].offset[plane], + EGL_DMA_BUF_PLANE0_PITCH_EXT, prime.layers[layer].pitch[plane], + EGL_NONE + }; + images[i] = cap_xcomp->egl.eglCreateImage(cap_xcomp->egl.egl_display, 0, EGL_LINUX_DMA_BUF_EXT, NULL, img_attr); // TODO: Cleanup at the end of this for loop + assert(images[i]); + assert(cap_xcomp->egl.eglGetError() == EGL_SUCCESS); + + //cap_xcomp->egl.glActiveTexture(GL_TEXTURE0 + i); + cap_xcomp->egl.glBindTexture(GL_TEXTURE_2D, cap_xcomp->target_textures[i]); + assert(cap_xcomp->egl.glGetError() == 0); + + cap_xcomp->egl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + cap_xcomp->egl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + cap_xcomp->egl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + cap_xcomp->egl.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + assert(cap_xcomp->egl.glGetError() == 0); + + cap_xcomp->egl.glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, images[i]); + assert(cap_xcomp->egl.glGetError() == 0); + assert(cap_xcomp->egl.eglGetError() == EGL_SUCCESS); + } + //cap_xcomp->egl.glActiveTexture(GL_TEXTURE0); + cap_xcomp->egl.glBindTexture(GL_TEXTURE_2D, 0); + + + + cap_xcomp->egl.glGenFramebuffers(1, &cap_xcomp->FramebufferNameY); + cap_xcomp->egl.glBindFramebuffer(GL_FRAMEBUFFER, cap_xcomp->FramebufferNameY); + + cap_xcomp->egl.glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, cap_xcomp->target_textures[0], 0); + // cap_xcomp->egl.glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, GL_TEXTURE_2D, cap_xcomp->target_textures[1], 0); + + // Set the list of draw buffers. 
+ unsigned int DrawBuffers[1] = {GL_COLOR_ATTACHMENT0}; + cap_xcomp->egl.glDrawBuffers(1, DrawBuffers); // "1" is the size of DrawBuffers + + if(cap_xcomp->egl.glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + fprintf(stderr, "Failed to setup framebuffer\n"); + return; + } + + cap_xcomp->egl.glBindFramebuffer(GL_FRAMEBUFFER, 0); + + cap_xcomp->egl.glGenFramebuffers(1, &cap_xcomp->FramebufferNameUV); + cap_xcomp->egl.glBindFramebuffer(GL_FRAMEBUFFER, cap_xcomp->FramebufferNameUV); + + cap_xcomp->egl.glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, cap_xcomp->target_textures[1], 0); + // cap_xcomp->egl.glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, GL_TEXTURE_2D, cap_xcomp->target_textures[1], 0); + + // Set the list of draw buffers. + cap_xcomp->egl.glDrawBuffers(1, DrawBuffers); // "1" is the size of DrawBuffers + + if(cap_xcomp->egl.glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + fprintf(stderr, "Failed to setup framebuffer\n"); + return; + } + + cap_xcomp->egl.glBindFramebuffer(GL_FRAMEBUFFER, 0); + + //cap_xcomp->egl.glGenVertexArrays(1, &cap_xcomp->quad_VertexArrayID); + //cap_xcomp->egl.glBindVertexArray(cap_xcomp->quad_VertexArrayID); + + static const float g_quad_vertex_buffer_data[] = { + -1.0f, -1.0f, 0.0f, + 1.0f, -1.0f, 0.0f, + -1.0f, 1.0f, 0.0f, + -1.0f, 1.0f, 0.0f, + 1.0f, -1.0f, 0.0f, + 1.0f, 1.0f, 0.0f, + }; + + //cap_xcomp->egl.glGenBuffers(1, &cap_xcomp->quad_vertexbuffer); + //cap_xcomp->egl.glBindBuffer(GL_ARRAY_BUFFER, cap_xcomp->quad_vertexbuffer); + //cap_xcomp->egl.glBufferData(GL_ARRAY_BUFFER, sizeof(g_quad_vertex_buffer_data), g_quad_vertex_buffer_data, GL_STATIC_DRAW); + + // Create and compile our GLSL program from the shaders + cap_xcomp->shader_y = LoadShadersY(cap_xcomp); + cap_xcomp->shader_uv = LoadShadersUV(cap_xcomp); + //int tex1 = cap_xcomp->egl.glGetUniformLocation(cap_xcomp->shader_y, "tex1"); + //cap_xcomp->egl.glUniform1i(tex1, 0); + //tex1 = cap_xcomp->egl.glGetUniformLocation(cap_xcomp->shader_uv, "tex1"); + //cap_xcomp->egl.glUniform1i(tex1, 0); + //int tex2 = cap_xcomp->egl.glGetUniformLocation(shader_program, "tex2"); + //fprintf(stderr, "uniform id: %u\n", tex1); + + float vVertices[] = { + -1.0f, 1.0f, 0.0f, 1.0f, + -1.0f, -1.0f, 0.0f, 0.0f, + 1.0f, -1.0f, 1.0f, 0.0f, + + -1.0f, 1.0f, 0.0f, 1.0f, + 1.0f, -1.0f, 1.0f, 0.0f, + 1.0f, 1.0f, 1.0f, 1.0f + }; + + unsigned int quadVBO; + cap_xcomp->egl.glGenVertexArrays(1, &cap_xcomp->quadVAO); + cap_xcomp->egl.glGenBuffers(1, &quadVBO); + cap_xcomp->egl.glBindVertexArray(cap_xcomp->quadVAO); + cap_xcomp->egl.glBindBuffer(GL_ARRAY_BUFFER, quadVBO); + cap_xcomp->egl.glBufferData(GL_ARRAY_BUFFER, sizeof(vVertices), &vVertices, GL_STATIC_DRAW); + + cap_xcomp->egl.glEnableVertexAttribArray(0); + cap_xcomp->egl.glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (void*)0); + + cap_xcomp->egl.glEnableVertexAttribArray(1); + cap_xcomp->egl.glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (void*)(2 * sizeof(float))); + + cap_xcomp->egl.glBindVertexArray(0); - res = av_hwframe_map(*frame, src_frame, AV_HWFRAME_MAP_DIRECT); - if(res < 0) { - fprintf(stderr, "av_hwframe_map failed: %d\n", res); + //cap_xcomp->egl.glUniform1i(tex1, 0); + //cap_xcomp->egl.glUniform1i(tex2, 1); + + //cap_xcomp->egl.glViewport(0, 0, 1920, 1080); + + //cap_xcomp->egl.glBindBuffer(GL_ARRAY_BUFFER, 0); + //cap_xcomp->egl.glBindVertexArray(0); + } else { // This happens on intel + fprintf(stderr, "unexpected fourcc: 
%u, expected nv12\n", prime.fourcc); + abort(); } // Clear texture with black background because the source texture (window_texture_get_opengl_texture_id(&cap_xcomp->window_texture)) // might be smaller than cap_xcomp->target_texture_id - cap_xcomp->egl.glClearTexImage(cap_xcomp->target_texture_id, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + // TODO: + //cap_xcomp->egl.glClearTexImage(cap_xcomp->target_texture_id, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); } } @@ -732,92 +860,35 @@ static bool gsr_capture_xcomposite_drm_should_stop(gsr_capture *cap, bool *err) #define GL_TRUE 1 #define GL_TRIANGLES 0x0004 -void FBO_2_PPM_file(gsr_capture_xcomposite_drm *cap_xcomp, int output_width, int output_height) -{ - FILE *output_image; - - /// READ THE PIXELS VALUES from FBO AND SAVE TO A .PPM FILE - int i, j, k; - unsigned char *pixels = (unsigned char*)malloc(output_width*output_height*3); - - unsigned int err = cap_xcomp->egl.glGetError(); - fprintf(stderr, "opengl err 1: %u\n", err); - - /// READ THE CONTENT FROM THE FBO - cap_xcomp->egl.glReadBuffer(GL_COLOR_ATTACHMENT0); - - err = cap_xcomp->egl.glGetError(); - fprintf(stderr, "opengl err 2: %u\n", err); - - cap_xcomp->egl.glReadPixels(0, 0, output_width, output_height, GL_RGBA, GL_UNSIGNED_BYTE, pixels); - - err = cap_xcomp->egl.glGetError(); - fprintf(stderr, "opengl err 3: %u\n", err); - - output_image = fopen("output.ppm", "wb"); - fprintf(output_image,"P3\n"); - fprintf(output_image,"# Created by Ricao\n"); - fprintf(output_image,"%d %d\n",output_width,output_height); - fprintf(output_image,"255\n"); - - k = 0; - for(i=0; i<output_width; i++) - { - for(j=0; j<output_height; j++) - { - fprintf(output_image,"%u %u %u ",(unsigned int)pixels[k],(unsigned int)pixels[k+1], - (unsigned int)pixels[k+2]); - k = k+4; - } - fprintf(output_image,"\n"); - } - free(pixels); - fclose(output_image); -} - static int gsr_capture_xcomposite_drm_capture(gsr_capture *cap, AVFrame *frame) { gsr_capture_xcomposite_drm *cap_xcomp = cap->priv; vec2i source_size = cap_xcomp->texture_size; - #if 1 - /* TODO: Remove this copy, which is only possible by using nvenc directly and encoding window_pixmap.target_texture_id */ - cap_xcomp->egl.glCopyImageSubData( - window_texture_get_opengl_texture_id(&cap_xcomp->window_texture), GL_TEXTURE_2D, 0, 0, 0, 0, - cap_xcomp->target_texture_id, GL_TEXTURE_2D, 0, 0, 0, 0, - source_size.x, source_size.y, 1); - unsigned int err = cap_xcomp->egl.glGetError(); - if(err != 0) { - static bool error_shown = false; - if(!error_shown) { - error_shown = true; - fprintf(stderr, "Error: glCopyImageSubData failed, gl error: %d\n", err); - } + cap_xcomp->egl.glBindVertexArray(cap_xcomp->quadVAO); + cap_xcomp->egl.glViewport(0, 0, source_size.x, source_size.y); + cap_xcomp->egl.glBindTexture(GL_TEXTURE_2D, window_texture_get_opengl_texture_id(&cap_xcomp->window_texture)); + + { + cap_xcomp->egl.glBindFramebuffer(GL_FRAMEBUFFER, cap_xcomp->FramebufferNameY); + //cap_xcomp->egl.glClear(GL_COLOR_BUFFER_BIT); + + cap_xcomp->egl.glUseProgram(cap_xcomp->shader_y); + cap_xcomp->egl.glDrawArrays(GL_TRIANGLES, 0, 6); } - #elif 0 - cap_xcomp->egl.glBindFramebuffer(GL_FRAMEBUFFER, cap_xcomp->FramebufferName); - cap_xcomp->egl.glViewport(0, 0, 1920, 1080); - //cap_xcomp->egl.glClearColor(0.0f, 0.0f, 0.0f, 1.0f); - cap_xcomp->egl.glClear(GL_COLOR_BUFFER_BIT); - cap_xcomp->egl.glUseProgram(shader_program); - cap_xcomp->egl.glBindTexture(GL_TEXTURE_2D, window_texture_get_opengl_texture_id(&cap_xcomp->window_texture)); - 
cap_xcomp->egl.glBindVertexArray(cap_xcomp->quadVAO); - cap_xcomp->egl.glDrawArrays(GL_TRIANGLES, 0, 6); - cap_xcomp->egl.glBindTexture(GL_TEXTURE_2D, 0); + { + cap_xcomp->egl.glBindFramebuffer(GL_FRAMEBUFFER, cap_xcomp->FramebufferNameUV); + //cap_xcomp->egl.glClear(GL_COLOR_BUFFER_BIT); - static int counter = 0; - ++counter; - static bool image_saved = false; - if(!image_saved && counter == 5) { - image_saved = true; - FBO_2_PPM_file(cap_xcomp, 1920, 1080); - fprintf(stderr, "saved image!\n"); + cap_xcomp->egl.glUseProgram(cap_xcomp->shader_uv); + cap_xcomp->egl.glDrawArrays(GL_TRIANGLES, 0, 6); } cap_xcomp->egl.glBindVertexArray(0); cap_xcomp->egl.glUseProgram(0); + cap_xcomp->egl.glBindTexture(GL_TEXTURE_2D, 0); cap_xcomp->egl.glBindFramebuffer(GL_FRAMEBUFFER, 0); - #endif + cap_xcomp->egl.eglSwapBuffers(cap_xcomp->egl.egl_display, cap_xcomp->egl.egl_surface); return 0; @@ -825,10 +896,15 @@ static int gsr_capture_xcomposite_drm_capture(gsr_capture *cap, AVFrame *frame) static void gsr_capture_xcomposite_drm_destroy(gsr_capture *cap, AVCodecContext *video_codec_context) { (void)video_codec_context; + gsr_capture_xcomposite_drm *cap_xcomp = cap->priv; if(cap->priv) { free(cap->priv); cap->priv = NULL; } + if(cap_xcomp->dpy) { + XCloseDisplay(cap_xcomp->dpy); + cap_xcomp->dpy = NULL; + } free(cap); } @@ -37,15 +37,13 @@ bool gsr_cuda_load(gsr_cuda *self) { { NULL, NULL } }; + CUresult res; + if(!dlsym_load_list(lib, required_dlsym)) { fprintf(stderr, "gsr error: gsr_cuda_load failed: missing required symbols in libcuda.so/libcuda.so.1\n"); - dlclose(lib); - memset(self, 0, sizeof(gsr_cuda)); - return false; + goto fail; } - CUresult res; - res = self->cuInit(0); if(res != CUDA_SUCCESS) { const char *err_str = "unknown"; @@ -57,7 +57,10 @@ static bool gsr_egl_create_window(gsr_egl *self) { goto fail; } - self->eglMakeCurrent(egl_display, egl_surface, egl_surface, egl_context); + if(!self->eglMakeCurrent(egl_display, egl_surface, egl_surface, egl_context)) { + fprintf(stderr, "gsr error: gsr_egl_create_window failed: failed to make context current\n"); + goto fail; + } self->egl_display = egl_display; self->egl_surface = egl_surface; @@ -79,6 +82,7 @@ static bool gsr_egl_create_window(gsr_egl *self) { static bool gsr_egl_load_egl(gsr_egl *self, void *library) { dlsym_assign required_dlsym[] = { + { (void**)&self->eglGetError, "eglGetError" }, { (void**)&self->eglGetDisplay, "eglGetDisplay" }, { (void**)&self->eglInitialize, "eglInitialize" }, { (void**)&self->eglTerminate, "eglTerminate" }, @@ -182,52 +186,45 @@ bool gsr_egl_load(gsr_egl *self, Display *dpy) { memset(self, 0, sizeof(gsr_egl)); self->dpy = dpy; + void *egl_lib = NULL; + void *gl_lib = NULL; + dlerror(); /* clear */ - void *egl_lib = dlopen("libEGL.so.1", RTLD_LAZY); + egl_lib = dlopen("libEGL.so.1", RTLD_LAZY); if(!egl_lib) { fprintf(stderr, "gsr error: gsr_egl_load: failed to load libEGL.so.1, error: %s\n", dlerror()); - return false; + goto fail; } - void *gl_lib = dlopen("libGL.so.1", RTLD_LAZY); + gl_lib = dlopen("libGL.so.1", RTLD_LAZY); if(!egl_lib) { fprintf(stderr, "gsr error: gsr_egl_load: failed to load libGL.so.1, error: %s\n", dlerror()); - dlclose(egl_lib); - memset(self, 0, sizeof(gsr_egl)); - return false; + goto fail; } - if(!gsr_egl_load_egl(self, egl_lib)) { - dlclose(egl_lib); - dlclose(gl_lib); - memset(self, 0, sizeof(gsr_egl)); - return false; - } + if(!gsr_egl_load_egl(self, egl_lib)) + goto fail; - if(!gsr_egl_load_gl(self, gl_lib)) { - dlclose(egl_lib); - dlclose(gl_lib); - memset(self, 0, 
sizeof(gsr_egl)); - return false; - } + if(!gsr_egl_load_gl(self, gl_lib)) + goto fail; - if(!gsr_egl_proc_load_egl(self)) { - dlclose(egl_lib); - dlclose(gl_lib); - memset(self, 0, sizeof(gsr_egl)); - return false; - } + if(!gsr_egl_proc_load_egl(self)) + goto fail; - if(!gsr_egl_create_window(self)) { - dlclose(egl_lib); - dlclose(gl_lib); - memset(self, 0, sizeof(gsr_egl)); - return false; - } + if(!gsr_egl_create_window(self)) + goto fail; self->egl_library = egl_lib; self->gl_library = gl_lib; return true; + + fail: + if(egl_lib) + dlclose(egl_lib); + if(gl_lib) + dlclose(gl_lib); + memset(self, 0, sizeof(gsr_egl)); + return false; } void gsr_egl_unload(gsr_egl *self) { diff --git a/src/main.cpp b/src/main.cpp index 5581e77..1c6dad9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -335,7 +335,7 @@ static AVCodecContext* create_audio_codec_context(int fps, AudioCodec audio_code static AVCodecContext *create_video_codec_context(AVPixelFormat pix_fmt, VideoQuality video_quality, - int fps, const AVCodec *codec, bool is_livestream) { + int fps, const AVCodec *codec, bool is_livestream, gpu_vendor vendor) { AVCodecContext *codec_context = avcodec_alloc_context3(codec); @@ -415,6 +415,13 @@ static AVCodecContext *create_video_codec_context(AVPixelFormat pix_fmt, av_opt_set_int(codec_context->priv_data, "b_ref_mode", 0, 0); + if(vendor != GPU_VENDOR_NVIDIA) { + // TODO: More options, better options + //codec_context->bit_rate = codec_context->width * codec_context->height; + av_opt_set(codec_context->priv_data, "rc_mode", "CQP", 0); + codec_context->global_quality = 4; + } + //codec_context->rc_max_rate = codec_context->bit_rate; //codec_context->rc_min_rate = codec_context->bit_rate; //codec_context->rc_buffer_size = codec_context->bit_rate / 10; @@ -424,10 +431,14 @@ static AVCodecContext *create_video_codec_context(AVPixelFormat pix_fmt, return codec_context; } -static bool check_if_codec_valid_for_hardware(const AVCodec *codec) { +static bool check_if_codec_valid_for_hardware(const AVCodec *codec, gpu_vendor vendor) { + // TODO: For now we assume that amd and intel always support h264 and hevc, but we default to h264 + if(vendor != GPU_VENDOR_NVIDIA) + return true; + bool success = false; // Do not use AV_PIX_FMT_CUDA because we dont want to do full check with hardware context - AVCodecContext *codec_context = create_video_codec_context(AV_PIX_FMT_YUV420P, VideoQuality::VERY_HIGH, 60, codec, false); + AVCodecContext *codec_context = create_video_codec_context(AV_PIX_FMT_YUV420P, VideoQuality::VERY_HIGH, 60, codec, false, vendor); codec_context->width = 1920; codec_context->height = 1080; if(codec_context) { @@ -446,7 +457,7 @@ static const AVCodec* find_h264_encoder(gpu_vendor vendor) { static bool checked_success = true; if(!checked) { checked = true; - if(!check_if_codec_valid_for_hardware(codec)) + if(!check_if_codec_valid_for_hardware(codec, vendor)) checked_success = false; } return checked_success ? codec : nullptr; @@ -466,7 +477,7 @@ static const AVCodec* find_h265_encoder(gpu_vendor vendor) { static bool checked_success = true; if(!checked) { checked = true; - if(!check_if_codec_valid_for_hardware(codec)) + if(!check_if_codec_valid_for_hardware(codec, vendor)) checked_success = false; } return checked_success ? 
codec : nullptr; @@ -508,36 +519,78 @@ static AVFrame* open_audio(AVCodecContext *audio_codec_context) { return frame; } -static void open_video(AVCodecContext *codec_context, VideoQuality video_quality, bool very_old_gpu) { - bool supports_p4 = false; - bool supports_p6 = false; - - const AVOption *opt = nullptr; - while((opt = av_opt_next(codec_context->priv_data, opt))) { - if(opt->type == AV_OPT_TYPE_CONST) { - if(strcmp(opt->name, "p4") == 0) - supports_p4 = true; - else if(strcmp(opt->name, "p6") == 0) - supports_p6 = true; +static void open_video(AVCodecContext *codec_context, VideoQuality video_quality, bool very_old_gpu, gpu_vendor vendor) { + AVDictionary *options = nullptr; + if(vendor == GPU_VENDOR_NVIDIA) { + bool supports_p4 = false; + bool supports_p6 = false; + + const AVOption *opt = nullptr; + while((opt = av_opt_next(codec_context->priv_data, opt))) { + if(opt->type == AV_OPT_TYPE_CONST) { + if(strcmp(opt->name, "p4") == 0) + supports_p4 = true; + else if(strcmp(opt->name, "p6") == 0) + supports_p6 = true; + } } - } - AVDictionary *options = nullptr; - if(very_old_gpu) { - switch(video_quality) { - case VideoQuality::MEDIUM: - av_dict_set_int(&options, "qp", 37, 0); - break; - case VideoQuality::HIGH: - av_dict_set_int(&options, "qp", 32, 0); - break; - case VideoQuality::VERY_HIGH: - av_dict_set_int(&options, "qp", 27, 0); - break; - case VideoQuality::ULTRA: - av_dict_set_int(&options, "qp", 21, 0); - break; + if(very_old_gpu) { + switch(video_quality) { + case VideoQuality::MEDIUM: + av_dict_set_int(&options, "qp", 37, 0); + break; + case VideoQuality::HIGH: + av_dict_set_int(&options, "qp", 32, 0); + break; + case VideoQuality::VERY_HIGH: + av_dict_set_int(&options, "qp", 27, 0); + break; + case VideoQuality::ULTRA: + av_dict_set_int(&options, "qp", 21, 0); + break; + } + } else { + switch(video_quality) { + case VideoQuality::MEDIUM: + av_dict_set_int(&options, "qp", 40, 0); + break; + case VideoQuality::HIGH: + av_dict_set_int(&options, "qp", 35, 0); + break; + case VideoQuality::VERY_HIGH: + av_dict_set_int(&options, "qp", 30, 0); + break; + case VideoQuality::ULTRA: + av_dict_set_int(&options, "qp", 24, 0); + break; + } } + + if(!supports_p4 && !supports_p6) + fprintf(stderr, "Info: your ffmpeg version is outdated. It's recommended that you use the flatpak version of gpu-screen-recorder version instead, which you can find at https://flathub.org/apps/details/com.dec05eba.gpu_screen_recorder\n"); + + //if(is_livestream) { + // av_dict_set_int(&options, "zerolatency", 1, 0); + // //av_dict_set(&options, "preset", "llhq", 0); + //} + + // Fuck nvidia and ffmpeg, I want to use a good preset for the gpu but all gpus prefer different + // presets. Nvidia and ffmpeg used to support "hq" preset that chose the best preset for the gpu + // with pretty good performance but you now have to choose p1-p7, which are gpu agnostic and on + // older gpus p5-p7 slow the gpu down to a crawl... + // "hq" is now just an alias for p7 in ffmpeg :( + // TODO: Temporary disable because of stuttering? + if(very_old_gpu) + av_dict_set(&options, "preset", supports_p4 ? "p4" : "medium", 0); + else + av_dict_set(&options, "preset", supports_p6 ? 
"p6" : "slow", 0); + + av_dict_set(&options, "tune", "hq", 0); + av_dict_set(&options, "rc", "constqp", 0); + + if(codec_context->codec_id == AV_CODEC_ID_H264) + av_dict_set(&options, "profile", "high", 0); } else { switch(video_quality) { case VideoQuality::MEDIUM: @@ -553,32 +606,19 @@ static void open_video(AVCodecContext *codec_context, VideoQuality video_quality av_dict_set_int(&options, "qp", 24, 0); break; } - } - - if(!supports_p4 && !supports_p6) - fprintf(stderr, "Info: your ffmpeg version is outdated. It's recommended that you use the flatpak version of gpu-screen-recorder version instead, which you can find at https://flathub.org/apps/details/com.dec05eba.gpu_screen_recorder\n"); - //if(is_livestream) { - // av_dict_set_int(&options, "zerolatency", 1, 0); - // //av_dict_set(&options, "preset", "llhq", 0); - //} + // TODO: More quality options + av_dict_set(&options, "rc_mode", "CQP", 0); + //av_dict_set_int(&options, "low_power", 1, 0); - // Fuck nvidia and ffmpeg, I want to use a good preset for the gpu but all gpus prefer different - // presets. Nvidia and ffmpeg used to support "hq" preset that chose the best preset for the gpu - // with pretty good performance but you now have to choose p1-p7, which are gpu agnostic and on - // older gpus p5-p7 slow the gpu down to a crawl... - // "hq" is now just an alias for p7 in ffmpeg :( - // TODO: Temporary disable because of stuttering? - if(very_old_gpu) - av_dict_set(&options, "preset", supports_p4 ? "p4" : "medium", 0); - else - av_dict_set(&options, "preset", supports_p6 ? "p6" : "slow", 0); - - av_dict_set(&options, "tune", "hq", 0); - av_dict_set(&options, "rc", "constqp", 0); - - if(codec_context->codec_id == AV_CODEC_ID_H264) - av_dict_set(&options, "profile", "high", 0); + if(codec_context->codec_id == AV_CODEC_ID_H264) { + av_dict_set(&options, "profile", "high", 0); + av_dict_set(&options, "coder", "cavlc", 0);// TODO: cavlc is faster than cabac but worse compression. Which to use? + av_dict_set_int(&options, "quality", 50, 0); + } else { + av_dict_set(&options, "profile", "main", 0); + } + } av_dict_set(&options, "strict", "experimental", 0); @@ -602,7 +642,7 @@ static void usage() { fprintf(stderr, " -r Replay buffer size in seconds. If this is set, then only the last seconds as set by this option will be stored" " and the video will only be saved when the gpu-screen-recorder is closed. This feature is similar to Nvidia's instant replay feature." " This option has be between 5 and 1200. Note that the replay buffer size will not always be precise, because of keyframes. Optional, disabled by default.\n"); - fprintf(stderr, " -k Video codec to use. Should be either 'auto', 'h264' or 'h265'. Defaults to 'auto' which defaults to 'h265' unless recording at a higher resolution than 3840x2160. Forcefully set to 'h264' if -c is 'flv'.\n"); + fprintf(stderr, " -k Video codec to use. Should be either 'auto', 'h264' or 'h265'. Defaults to 'auto' which defaults to 'h265' on nvidia unless recording at a higher resolution than 3840x2160. On AMD/Intel this defaults to 'auto' which defaults to 'h264'. Forcefully set to 'h264' if -c is 'flv'.\n"); fprintf(stderr, " -ac Audio codec to use. Should be either 'aac', 'opus' or 'flac'. Defaults to 'opus' for .mp4/.mkv files, otherwise defaults to 'aac'. 'opus' and 'flac' is only supported by .mp4/.mkv files. 'opus' is recommended for best performance and smallest audio size.\n"); fprintf(stderr, " -o The output file path. If omitted then the encoded data is sent to stdout. 
Required in replay mode (when using -r). In replay mode this has to be an existing directory instead of a file.\n"); fprintf(stderr, "NOTES:\n"); @@ -1387,23 +1427,41 @@ int main(int argc, char **argv) { const double target_fps = 1.0 / (double)fps; if(strcmp(video_codec_to_use, "auto") == 0) { - const AVCodec *h265_codec = find_h265_encoder(gpu_inf.vendor); - - // h265 generally allows recording at a higher resolution than h264 on nvidia cards. On a gtx 1080 4k is the max resolution for h264 but for h265 it's 8k. - // Another important info is that when recording at a higher fps than.. 60? h265 has very bad performance. For example when recording at 144 fps the fps drops to 1 - // while with h264 the fps doesn't drop. - if(!h265_codec) { - fprintf(stderr, "Info: using h264 encoder because a codec was not specified and your gpu does not support h265\n"); - video_codec_to_use = "h264"; - video_codec = VideoCodec::H264; - } else if(fps > 60) { - fprintf(stderr, "Info: using h264 encoder because a codec was not specified and fps is more than 60\n"); - video_codec_to_use = "h264"; - video_codec = VideoCodec::H264; + if(gpu_inf.vendor == GPU_VENDOR_NVIDIA) { + const AVCodec *h265_codec = find_h265_encoder(gpu_inf.vendor); + + // h265 generally allows recording at a higher resolution than h264 on nvidia cards. On a gtx 1080 4k is the max resolution for h264 but for h265 it's 8k. + // Another important info is that when recording at a higher fps than.. 60? h265 has very bad performance. For example when recording at 144 fps the fps drops to 1 + // while with h264 the fps doesn't drop. + if(!h265_codec) { + fprintf(stderr, "Info: using h264 encoder because a codec was not specified and your gpu does not support h265\n"); + video_codec_to_use = "h264"; + video_codec = VideoCodec::H264; + } else if(fps > 60) { + fprintf(stderr, "Info: using h264 encoder because a codec was not specified and fps is more than 60\n"); + video_codec_to_use = "h264"; + video_codec = VideoCodec::H264; + } else { + fprintf(stderr, "Info: using h265 encoder because a codec was not specified\n"); + video_codec_to_use = "h265"; + video_codec = VideoCodec::H265; + } } else { - fprintf(stderr, "Info: using h265 encoder because a codec was not specified\n"); - video_codec_to_use = "h265"; - video_codec = VideoCodec::H265; + const AVCodec *h264_codec = find_h264_encoder(gpu_inf.vendor); + + if(!h264_codec) { + fprintf(stderr, "Info: using h265 encoder because a codec was not specified and your gpu does not support h264\n"); + video_codec_to_use = "h265"; + video_codec = VideoCodec::H265; + //} else if(fps > 60) { + // fprintf(stderr, "Info: using h264 encoder because a codec was not specified and fps is more than 60\n"); + // video_codec_to_use = "h264"; + // video_codec = VideoCodec::H264; + } else { + fprintf(stderr, "Info: using h264 encoder because a codec was not specified\n"); + video_codec_to_use = "h264"; + video_codec = VideoCodec::H264; + } } } @@ -1442,7 +1500,7 @@ int main(int argc, char **argv) { AVStream *video_stream = nullptr; std::vector<AudioTrack> audio_tracks; - AVCodecContext *video_codec_context = create_video_codec_context(gpu_inf.vendor == GPU_VENDOR_NVIDIA ? AV_PIX_FMT_CUDA : AV_PIX_FMT_VAAPI, quality, fps, video_codec_f, is_livestream); + AVCodecContext *video_codec_context = create_video_codec_context(gpu_inf.vendor == GPU_VENDOR_NVIDIA ? 
AV_PIX_FMT_CUDA : AV_PIX_FMT_VAAPI, quality, fps, video_codec_f, is_livestream, gpu_inf.vendor); if(replay_buffer_size_secs == -1) video_stream = create_stream(av_format_context, video_codec_context); @@ -1451,7 +1509,7 @@ int main(int argc, char **argv) { return 1; } - open_video(video_codec_context, quality, very_old_gpu); + open_video(video_codec_context, quality, very_old_gpu, gpu_inf.vendor); if(video_stream) avcodec_parameters_from_context(video_stream->codecpar, video_codec_context); diff --git a/src/vaapi.c b/src/vaapi.c new file mode 100644 index 0000000..bb1b1fd --- /dev/null +++ b/src/vaapi.c @@ -0,0 +1,41 @@ +#include "../include/vaapi.h" +#include "../include/library_loader.h" +#include <string.h> + +bool gsr_vaapi_load(gsr_vaapi *self) { + memset(self, 0, sizeof(gsr_vaapi)); + + dlerror(); /* clear */ + void *lib = dlopen("libva.so.2", RTLD_LAZY); + if(!lib) { + fprintf(stderr, "gsr error: gsr_vaapi_load failed: failed to load libva.so, error: %s\n", dlerror()); + return false; + } + + dlsym_assign required_dlsym[] = { + { (void**)&self->vaExportSurfaceHandle, "vaExportSurfaceHandle" }, + { (void**)&self->vaSyncSurface, "vaSyncSurface" }, + + { NULL, NULL } + }; + + if(!dlsym_load_list(lib, required_dlsym)) { + fprintf(stderr, "gsr error: gsr_vaapi_load failed: missing required symbols in libcuda.so/libcuda.so.1\n"); + goto fail; + } + + self->library = lib; + return true; + + fail: + dlclose(lib); + memset(self, 0, sizeof(gsr_vaapi)); + return false; +} + +void gsr_vaapi_unload(gsr_vaapi *self) { + if(self->library) { + dlclose(self->library); + memset(self, 0, sizeof(gsr_vaapi)); + } +} diff --git a/src/window_texture.c b/src/window_texture.c index 72a2474..df34a37 100644 --- a/src/window_texture.c +++ b/src/window_texture.c @@ -85,8 +85,10 @@ int window_texture_on_resize(WindowTexture *self) { self->egl->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); self->egl->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - self->egl->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - self->egl->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + self->egl->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + self->egl->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + + while(self->egl->eglGetError() != EGL_SUCCESS) {} image = self->egl->eglCreateImage(self->egl->egl_display, NULL, EGL_NATIVE_PIXMAP_KHR, (EGLClientBuffer)pixmap, pixmap_attrs); if(!image) { @@ -95,7 +97,7 @@ int window_texture_on_resize(WindowTexture *self) { } self->egl->glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, image); - if(self->egl->glGetError() != 0) { + if(self->egl->glGetError() != 0 || self->egl->eglGetError() != EGL_SUCCESS) { result = 5; goto cleanup; } |
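
On the FFmpeg side, drm_create_codec_context() now builds an AV_PIX_FMT_VAAPI frames context with NV12 software frames and a pool of one surface, and pulls the VADisplay out of the device context so the capture code can export that surface; main.cpp additionally switches non-NVIDIA encoders to CQP rate control. A condensed sketch of that encoder setup follows, under stated assumptions: the function name, the /dev/dri/renderD128 device path and the QP value are illustrative rather than taken from the commit, and cleanup is reduced to a single failure path.

/* Sketch only: VAAPI encoder context roughly along the lines of the
 * drm_create_codec_context()/main.cpp changes in this commit. */
#include <libavcodec/avcodec.h>
#include <libavutil/hwcontext.h>
#include <libavutil/hwcontext_vaapi.h>
#include <libavutil/opt.h>

static AVCodecContext *create_vaapi_h264_encoder(int width, int height, int fps, VADisplay *out_va_dpy) {
    AVBufferRef *device_ref = NULL;
    AVBufferRef *frames_ref = NULL;

    const AVCodec *codec = avcodec_find_encoder_by_name("h264_vaapi");
    if(!codec)
        return NULL;

    AVCodecContext *ctx = avcodec_alloc_context3(codec);
    if(!ctx)
        return NULL;
    ctx->width = width;
    ctx->height = height;
    ctx->time_base = (AVRational){1, fps};
    ctx->framerate = (AVRational){fps, 1};
    ctx->pix_fmt = AV_PIX_FMT_VAAPI;

    /* Device path is an assumption; the project derives it differently. */
    if(av_hwdevice_ctx_create(&device_ref, AV_HWDEVICE_TYPE_VAAPI, "/dev/dri/renderD128", NULL, 0) < 0)
        goto fail;

    frames_ref = av_hwframe_ctx_alloc(device_ref);
    if(!frames_ref)
        goto fail;

    AVHWFramesContext *frames = (AVHWFramesContext*)frames_ref->data;
    frames->format = AV_PIX_FMT_VAAPI;
    frames->sw_format = AV_PIX_FMT_NV12; /* the sw_format the diff switches to */
    frames->width = width;
    frames->height = height;
    frames->initial_pool_size = 1;       /* the diff also pins the pool to one surface */
    if(av_hwframe_ctx_init(frames_ref) < 0)
        goto fail;
    ctx->hw_frames_ctx = av_buffer_ref(frames_ref);

    /* The capture side needs the VADisplay behind this device so it can call
     * vaExportSurfaceHandle() on the surface stored in frame->data[3]. */
    *out_va_dpy = ((AVVAAPIDeviceContext*)((AVHWDeviceContext*)device_ref->data)->hwctx)->display;

    av_opt_set(ctx->priv_data, "rc_mode", "CQP", 0); /* constant-QP, as in the diff */
    ctx->global_quality = 30;                        /* illustrative QP, not the commit's value */

    if(avcodec_open2(ctx, codec, NULL) < 0)
        goto fail;

    av_buffer_unref(&frames_ref);
    av_buffer_unref(&device_ref);
    return ctx;

fail:
    av_buffer_unref(&frames_ref);
    av_buffer_unref(&device_ref);
    avcodec_free_context(&ctx);
    return NULL;
}
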