aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordec05eba <dec05eba@protonmail.com>2022-04-04 06:13:52 +0200
committerdec05eba <dec05eba@protonmail.com>2022-04-05 04:06:47 +0200
commit6a01677e236b76097e93de1cee06b05ab0aa0efa (patch)
tree6832222c2754ac4725232430adefbb8155298f84
parentc43fa5e4eeee78c0990a6343b6c6246422b3ab9c (diff)
Remove unnecessary cuda memcpy when using nvfbc
-rw-r--r--.gitignore3
-rw-r--r--TODO6
-rw-r--r--src/main.cpp101
3 files changed, 41 insertions, 69 deletions
diff --git a/.gitignore b/.gitignore
index 0cf74c6..7d676bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,5 @@ tests/compile_commands.json
.clangd/
.cache/
-main.o
-sound.o
+*.o
gpu-screen-recorder
diff --git a/TODO b/TODO
index e082557..59432d6 100644
--- a/TODO
+++ b/TODO
@@ -1,11 +1,7 @@
Check for reparent.
Only add window to list if its the window is a topmost window.
-Use nvEncoder api directly? maybe with this we could copy the window opengl texture directly to the gpu which doesn't work right now for some reason.
- Right now we are required to copy the opengl texture to another opengl texture first.
- nvEncRegisterResource allows registering an opengl texture directly with NV_ENC_INPUT_RESOURCE_OPENGL_TEX and using that directly in the encoding.
Load cuda at runtime with dlopen.
Track window damages and only update then. That is better for output file size.
-Remove cuda to cuda copy when using nvFBC if possible. ffmpeg is getting in the way.
Getting the texture of a window when using a compositor is an nvidia specific limitation. When gpu-screen-recorder supports other gpus then this can be ignored.
Remove dependency on glfw (and glew?).
-Quickly changing workspace and back while recording under i3 breaks the screen recorder. The resize is triggered and it fails to recreate texture (fail to get texture size, texture id probably == 0).
+Quickly changing workspace and back while recording under i3 breaks the screen recorder. i3 probably unmaps windows in other workspaces.
diff --git a/src/main.cpp b/src/main.cpp
index 280e3f3..ea1f2fc 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -310,23 +310,6 @@ static bool recreate_window_pixmap(Display *dpy, Window window_id,
return pixmap.texture_id != 0 && pixmap.target_texture_id != 0;
}
-std::vector<std::string> get_hardware_acceleration_device_names() {
- int iGpu = 0;
- int nGpu = 0;
- cuDeviceGetCount(&nGpu);
- if (iGpu < 0 || iGpu >= nGpu) {
- fprintf(stderr, "Error: failed...\n");
- return {};
- }
-
- CUdevice cuDevice = 0;
- cuDeviceGet(&cuDevice, iGpu);
- char deviceName[80];
- cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice);
- fprintf(stderr, "device name: %s\n", deviceName);
- return {deviceName};
-}
-
// |stream| is only required for non-replay mode
static void receive_frames(AVCodecContext *av_codec_context, int stream_index, AVStream *stream, AVFrame *frame,
AVFormatContext *av_format_context,
@@ -438,7 +421,6 @@ static AVCodecContext *create_video_codec_context(AVFormatContext *av_format_con
assert(codec->type == AVMEDIA_TYPE_VIDEO);
codec_context->codec_id = codec->id;
- fprintf(stderr, "codec id: %d\n", codec->id);
codec_context->width = record_width & ~1;
codec_context->height = record_height & ~1;
codec_context->bit_rate = 7500000 + (codec_context->width * codec_context->height) / 2;
@@ -464,6 +446,7 @@ static AVCodecContext *create_video_codec_context(AVFormatContext *av_format_con
//av_opt_set(codec_context->priv_data, "preset", "slow", 0);
//av_opt_set(codec_context->priv_data, "profile", "high", 0);
//codec_context->profile = FF_PROFILE_H264_HIGH;
+ av_opt_set(codec_context->priv_data, "preset", "p4", 0);
break;
case VideoQuality::HIGH:
codec_context->qmin = 12;
@@ -471,6 +454,7 @@ static AVCodecContext *create_video_codec_context(AVFormatContext *av_format_con
//av_opt_set(codec_context->priv_data, "preset", "slow", 0);
//av_opt_set(codec_context->priv_data, "profile", "high", 0);
//codec_context->profile = FF_PROFILE_H264_HIGH;
+ av_opt_set(codec_context->priv_data, "preset", "p6", 0);
break;
case VideoQuality::ULTRA:
codec_context->bit_rate = 10000000 + (codec_context->width * codec_context->height) / 2;
@@ -479,6 +463,7 @@ static AVCodecContext *create_video_codec_context(AVFormatContext *av_format_con
//av_opt_set(codec_context->priv_data, "preset", "veryslow", 0);
//av_opt_set(codec_context->priv_data, "profile", "high", 0);
//codec_context->profile = FF_PROFILE_H264_HIGH;
+ av_opt_set(codec_context->priv_data, "preset", "p7", 0);
break;
}
if (codec_context->codec_id == AV_CODEC_ID_MPEG1VIDEO)
@@ -486,6 +471,7 @@ static AVCodecContext *create_video_codec_context(AVFormatContext *av_format_con
// stream->time_base = codec_context->time_base;
// codec_context->ticks_per_frame = 30;
+ av_opt_set(codec_context->priv_data, "tune", "hq", 0);
// Some formats want stream headers to be seperate
if (av_format_context->oformat->flags & AVFMT_GLOBALHEADER)
@@ -524,24 +510,20 @@ static AVFrame* open_audio(AVCodecContext *audio_codec_context) {
static void open_video(AVCodecContext *codec_context,
WindowPixmap &window_pixmap, AVBufferRef **device_ctx,
- CUgraphicsResource *cuda_graphics_resource) {
+ CUgraphicsResource *cuda_graphics_resource, CUcontext cuda_context) {
int ret;
- std::vector<std::string> hardware_accelerated_devices =
- get_hardware_acceleration_device_names();
- if (hardware_accelerated_devices.empty()) {
- fprintf(
- stderr,
- "Error: No hardware accelerated device was found on your system\n");
+ *device_ctx = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_CUDA);
+ if(!*device_ctx) {
+ fprintf(stderr, "Error: Failed to create hardware device context\n");
exit(1);
}
- if (av_hwdevice_ctx_create(device_ctx, AV_HWDEVICE_TYPE_CUDA,
- hardware_accelerated_devices[0].c_str(), NULL,
- 0) < 0) {
- fprintf(stderr,
- "Error: Failed to create hardware device context for gpu: %s\n",
- hardware_accelerated_devices[0].c_str());
+ AVHWDeviceContext *hw_device_context = (AVHWDeviceContext *)(*device_ctx)->data;
+ AVCUDADeviceContext *cuda_device_context = (AVCUDADeviceContext *)hw_device_context->hwctx;
+ cuda_device_context->cuda_ctx = cuda_context;
+ if(av_hwdevice_ctx_init(*device_ctx) < 0) {
+ fprintf(stderr, "Error: Failed to create hardware device context\n");
exit(1);
}
@@ -576,21 +558,11 @@ static void open_video(AVCodecContext *codec_context,
exit(1);
}
- AVHWDeviceContext *hw_device_context =
- (AVHWDeviceContext *)(*device_ctx)->data;
- AVCUDADeviceContext *cuda_device_context =
- (AVCUDADeviceContext *)hw_device_context->hwctx;
- CUcontext *cuda_context = &(cuda_device_context->cuda_ctx);
- if (!cuda_context) {
- fprintf(stderr, "Error: No cuda context\n");
- exit(1);
- }
-
if(window_pixmap.target_texture_id != 0) {
CUresult res;
CUcontext old_ctx;
res = cuCtxPopCurrent(&old_ctx);
- res = cuCtxPushCurrent(*cuda_context);
+ res = cuCtxPushCurrent(cuda_context);
res = cuGraphicsGLRegisterImage(
cuda_graphics_resource, window_pixmap.target_texture_id, GL_TEXTURE_2D,
CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY);
@@ -930,21 +902,34 @@ int main(int argc, char **argv) {
res = cuInit(0);
if(res != CUDA_SUCCESS) {
- fprintf(stderr, "Error: cuInit failed (result: %d)\n", res);
- return {};
+ const char *err_str;
+ cuGetErrorString(res, &err_str);
+ fprintf(stderr, "Error: cuInit failed, error %s (result: %d)\n", err_str, res);
+ return 1;
+ }
+
+ int nGpu = 0;
+ cuDeviceGetCount(&nGpu);
+ if (nGpu <= 0) {
+ fprintf(stderr, "Error: no cuda supported devices found\n");
+ return 1;
}
CUdevice cu_dev;
res = cuDeviceGet(&cu_dev, 0);
if(res != CUDA_SUCCESS) {
- fprintf(stderr, "Unable to get CUDA device (result: %d)\n", res);
+ const char *err_str;
+ cuGetErrorString(res, &err_str);
+ fprintf(stderr, "Error: unable to get CUDA device, error: %s (result: %d)\n", err_str, res);
return 1;
}
CUcontext cu_ctx;
res = cuCtxCreate_v2(&cu_ctx, CU_CTX_SCHED_AUTO, cu_dev);
if(res != CUDA_SUCCESS) {
- fprintf(stderr, "Unable to create CUDA context (result: %d)\n", res);
+ const char *err_str;
+ cuGetErrorString(res, &err_str);
+ fprintf(stderr, "Error: unable to create CUDA context, error: %s (result: %d)\n", err_str, res);
return 1;
}
@@ -1124,7 +1109,7 @@ int main(int argc, char **argv) {
AVBufferRef *device_ctx;
CUgraphicsResource cuda_graphics_resource;
- open_video(video_codec_context, window_pixmap, &device_ctx, &cuda_graphics_resource);
+ open_video(video_codec_context, window_pixmap, &device_ctx, &cuda_graphics_resource, cu_ctx);
if(video_stream)
avcodec_parameters_from_context(video_stream->codecpar, video_codec_context);
@@ -1161,16 +1146,6 @@ int main(int argc, char **argv) {
}
}
- AVHWDeviceContext *hw_device_context =
- (AVHWDeviceContext *)device_ctx->data;
- AVCUDADeviceContext *cuda_device_context =
- (AVCUDADeviceContext *)hw_device_context->hwctx;
- CUcontext *cuda_context = &(cuda_device_context->cuda_ctx);
- if (!cuda_context) {
- fprintf(stderr, "Error: No cuda context\n");
- exit(1);
- }
-
// av_frame_free(&rgb_frame);
// avcodec_close(av_codec_context);
@@ -1195,7 +1170,7 @@ int main(int argc, char **argv) {
CUarray mapped_array;
if(src_window_id) {
res = cuCtxPopCurrent(&old_ctx);
- res = cuCtxPushCurrent(*cuda_context);
+ res = cuCtxPushCurrent(cu_ctx);
// Get texture
res = cuGraphicsResourceSetMapFlags(
@@ -1431,6 +1406,8 @@ int main(int argc, char **argv) {
// int err = glGetError();
// fprintf(stderr, "error: %d\n", err);
+ // TODO: Remove this copy, which is only possible by using nvenc directly and encoding window_pixmap.target_texture_id
+
CUDA_MEMCPY2D memcpy_struct;
memcpy_struct.srcXInBytes = 0;
memcpy_struct.srcY = 0;
@@ -1449,11 +1426,11 @@ int main(int argc, char **argv) {
frame_captured = true;
} else {
- uint32_t byte_size;
- CUdeviceptr src_cu_device_ptr;
+ // TODO: Check when src_cu_device_ptr changes and re-register resource
+ uint32_t byte_size = 0;
+ CUdeviceptr src_cu_device_ptr = 0;
frame_captured = nv_fbc_library.capture(&src_cu_device_ptr, &byte_size);
- if(frame_captured)
- cuMemcpyDtoD((CUdeviceptr)frame->data[0], src_cu_device_ptr, byte_size);
+ frame->data[0] = (uint8_t*)src_cu_device_ptr;
}
// res = cuCtxPopCurrent(&old_ctx);
}