1 files changed, 138 insertions, 103 deletions
diff --git a/src/main.cpp b/src/main.cpp
index 98740c3..1c5024b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -46,6 +46,8 @@ extern "C" {
 
 // TODO: Remove LIBAVUTIL_VERSION_MAJOR checks in the future when ubuntu, pop os LTS etc update ffmpeg to >= 5.0
 
+static const int AUDIO_SAMPLE_RATE = 48000;
+
 static const int VIDEO_STREAM_INDEX = 0;
 
 static thread_local char av_error_buffer[AV_ERROR_MAX_STRING_SIZE];
@@ -176,7 +178,7 @@ static void receive_frames(AVCodecContext *av_codec_context, int stream_index, A
             } else {
                 av_packet_rescale_ts(av_packet, av_codec_context->time_base, stream->time_base);
                 av_packet->stream_index = stream->index;
-                // TODO: Is av_interleaved_write_frame needed?
+                // TODO: Is av_interleaved_write_frame needed? Answer: it might be needed for mkv, but don't use it! It causes frames to be inconsistent, skipping frames and duplicating frames
                 int ret = av_write_frame(av_format_context, av_packet);
                 if(ret < 0) {
                     fprintf(stderr, "Error: Failed to write frame index %d to muxer, reason: %s (%d)\n", av_packet->stream_index, av_error_to_string(ret), ret);
@@ -305,7 +307,7 @@ static AVCodecContext* create_audio_codec_context(int fps, AudioCodec audio_code
     codec_context->codec_id = codec->id;
     codec_context->sample_fmt = audio_codec_get_sample_format(audio_codec, codec, mix_audio);
     codec_context->bit_rate = audio_bitrate == 0 ? audio_codec_get_get_bitrate(audio_codec) : audio_bitrate;
-    codec_context->sample_rate = 48000;
+    codec_context->sample_rate = AUDIO_SAMPLE_RATE;
     if(audio_codec == AudioCodec::AAC)
         codec_context->profile = FF_PROFILE_AAC_LOW;
 #if LIBAVCODEC_VERSION_MAJOR < 60
@@ -316,7 +318,7 @@ static AVCodecContext* create_audio_codec_context(int fps, AudioCodec audio_code
 #endif
 
     codec_context->time_base.num = 1;
-    codec_context->time_base.den = AV_TIME_BASE;
+    codec_context->time_base.den = codec_context->sample_rate;
     codec_context->thread_count = 1;
     codec_context->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
 
@@ -414,19 +416,21 @@ static AVCodecContext *create_video_codec_context(AVPixelFormat pix_fmt,
     codec_context->bit_rate = 0;
     #endif
 
+    // 8 bit / 10 bit = 80%, and increase it even more
+    const float quality_multiply = hdr ? (8.0f/10.0f * 0.7f) : 1.0f;
     if(vendor != GSR_GPU_VENDOR_NVIDIA) {
         switch(video_quality) {
             case VideoQuality::MEDIUM:
-                codec_context->global_quality = 180;
+                codec_context->global_quality = 180 * quality_multiply;
                 break;
             case VideoQuality::HIGH:
-                codec_context->global_quality = 140;
+                codec_context->global_quality = 140 * quality_multiply;
                 break;
             case VideoQuality::VERY_HIGH:
-                codec_context->global_quality = 120;
+                codec_context->global_quality = 120 * quality_multiply;
                 break;
             case VideoQuality::ULTRA:
-                codec_context->global_quality = 100;
+                codec_context->global_quality = 100 * quality_multiply;
                 break;
         }
     }
@@ -543,7 +547,7 @@ static const AVCodec* find_h264_encoder(gsr_gpu_vendor vendor, const char *card_
     return checked_success ? codec : nullptr;
 }
 
-static const AVCodec* find_h265_encoder(gsr_gpu_vendor vendor, const char *card_path) {
+static const AVCodec* find_hevc_encoder(gsr_gpu_vendor vendor, const char *card_path) {
     const AVCodec *codec = avcodec_find_encoder_by_name(vendor == GSR_GPU_VENDOR_NVIDIA ? "hevc_nvenc" : "hevc_vaapi");
     if(!codec)
         codec = avcodec_find_encoder_by_name(vendor == GSR_GPU_VENDOR_NVIDIA ? "nvenc_hevc" : "vaapi_hevc");
@@ -624,8 +628,13 @@ static AVFrame* create_audio_frame(AVCodecContext *audio_codec_context) {
 }
 
 static void open_video(AVCodecContext *codec_context, VideoQuality video_quality, bool very_old_gpu, gsr_gpu_vendor vendor, PixelFormat pixel_format, bool hdr) {
+    (void)very_old_gpu;
     AVDictionary *options = nullptr;
+    // 8 bit / 10 bit = 80%
+    const float qp_multiply = hdr ? 8.0f/10.0f : 1.0f;
     if(vendor == GSR_GPU_VENDOR_NVIDIA) {
+        // Setting the preset is disabled since some Nvidia gpus (such as the Nvidia RTX A2000) can't handle it nicely and it greatly reduces encoding performance (from more than 60 fps to less than 45 fps)
+        #if 0
         bool supports_p4 = false;
         bool supports_p5 = false;
 
@@ -638,54 +647,56 @@ static void open_video(AVCodecContext *codec_context, VideoQuality video_quality
                     supports_p5 = true;
             }
         }
+        #endif
 
         if(codec_context->codec_id == AV_CODEC_ID_AV1) {
             switch(video_quality) {
                 case VideoQuality::MEDIUM:
-                    av_dict_set_int(&options, "qp", 37, 0);
+                    av_dict_set_int(&options, "qp", 37 * qp_multiply, 0);
                     break;
                 case VideoQuality::HIGH:
-                    av_dict_set_int(&options, "qp", 32, 0);
+                    av_dict_set_int(&options, "qp", 32 * qp_multiply, 0);
                     break;
                 case VideoQuality::VERY_HIGH:
-                    av_dict_set_int(&options, "qp", 28, 0);
+                    av_dict_set_int(&options, "qp", 28 * qp_multiply, 0);
                     break;
                 case VideoQuality::ULTRA:
-                    av_dict_set_int(&options, "qp", 24, 0);
+                    av_dict_set_int(&options, "qp", 24 * qp_multiply, 0);
                     break;
             }
-        } else if(very_old_gpu || codec_context->codec_id == AV_CODEC_ID_H264) {
+        } else if(codec_context->codec_id == AV_CODEC_ID_H264) {
             switch(video_quality) {
                 case VideoQuality::MEDIUM:
-                    av_dict_set_int(&options, "qp", 37, 0);
+                    av_dict_set_int(&options, "qp", 34 * qp_multiply, 0);
                     break;
                 case VideoQuality::HIGH:
-                    av_dict_set_int(&options, "qp", 32, 0);
+                    av_dict_set_int(&options, "qp", 30 * qp_multiply, 0);
                     break;
                 case VideoQuality::VERY_HIGH:
-                    av_dict_set_int(&options, "qp", 27, 0);
+                    av_dict_set_int(&options, "qp", 26 * qp_multiply, 0);
                     break;
                 case VideoQuality::ULTRA:
-                    av_dict_set_int(&options, "qp", 21, 0);
+                    av_dict_set_int(&options, "qp", 22 * qp_multiply, 0);
                     break;
             }
         } else {
             switch(video_quality) {
                 case VideoQuality::MEDIUM:
-                    av_dict_set_int(&options, "qp", 37, 0);
+                    av_dict_set_int(&options, "qp", 37 * qp_multiply, 0);
                     break;
                 case VideoQuality::HIGH:
-                    av_dict_set_int(&options, "qp", 32, 0);
+                    av_dict_set_int(&options, "qp", 32 * qp_multiply, 0);
                     break;
                 case VideoQuality::VERY_HIGH:
-                    av_dict_set_int(&options, "qp", 28, 0);
+                    av_dict_set_int(&options, "qp", 28 * qp_multiply, 0);
                     break;
                 case VideoQuality::ULTRA:
-                    av_dict_set_int(&options, "qp", 24, 0);
+                    av_dict_set_int(&options, "qp", 24 * qp_multiply, 0);
                     break;
             }
         }
 
+        #if 0
         if(!supports_p4 && !supports_p5)
             fprintf(stderr, "Info: your ffmpeg version is outdated. It's recommended that you use the flatpak version of gpu-screen-recorder version instead, which you can find at https://flathub.org/apps/details/com.dec05eba.gpu_screen_recorder\n");
 
@@ -708,6 +719,7 @@ static void open_video(AVCodecContext *codec_context, VideoQuality video_quality
             av_dict_set(&options, "preset", supports_p4 ? "p4" : "medium", 0);
         else
             av_dict_set(&options, "preset", supports_p5 ? "p5" : "slow", 0);
+        #endif
 
         av_dict_set(&options, "tune", "hq", 0);
         av_dict_set(&options, "rc", "constqp", 0);
@@ -745,31 +757,31 @@ static void open_video(AVCodecContext *codec_context, VideoQuality video_quality
         } else if(codec_context->codec_id == AV_CODEC_ID_H264) {
             switch(video_quality) {
                 case VideoQuality::MEDIUM:
-                    av_dict_set_int(&options, "qp", 34, 0);
+                    av_dict_set_int(&options, "qp", 34 * qp_multiply, 0);
                     break;
                 case VideoQuality::HIGH:
-                    av_dict_set_int(&options, "qp", 30, 0);
+                    av_dict_set_int(&options, "qp", 30 * qp_multiply, 0);
                     break;
                 case VideoQuality::VERY_HIGH:
-                    av_dict_set_int(&options, "qp", 26, 0);
+                    av_dict_set_int(&options, "qp", 26 * qp_multiply, 0);
                     break;
                 case VideoQuality::ULTRA:
-                    av_dict_set_int(&options, "qp", 22, 0);
+                    av_dict_set_int(&options, "qp", 22 * qp_multiply, 0);
                     break;
             }
         } else {
             switch(video_quality) {
                 case VideoQuality::MEDIUM:
-                    av_dict_set_int(&options, "qp", 37, 0);
+                    av_dict_set_int(&options, "qp", 37 * qp_multiply, 0);
                     break;
                 case VideoQuality::HIGH:
-                    av_dict_set_int(&options, "qp", 32, 0);
+                    av_dict_set_int(&options, "qp", 32 * qp_multiply, 0);
                     break;
                 case VideoQuality::VERY_HIGH:
-                    av_dict_set_int(&options, "qp", 28, 0);
+                    av_dict_set_int(&options, "qp", 28 * qp_multiply, 0);
                     break;
                 case VideoQuality::ULTRA:
-                    av_dict_set_int(&options, "qp", 24, 0);
+                    av_dict_set_int(&options, "qp", 24 * qp_multiply, 0);
                     break;
             }
         }
@@ -821,7 +833,7 @@ static void usage_full() {
     fprintf(stderr, "OPTIONS:\n");
     fprintf(stderr, "  -w    Window id to record, a display (monitor name), \"screen\", \"screen-direct-force\" or \"focused\".\n");
     fprintf(stderr, "        If this is \"screen\" or \"screen-direct-force\" then all monitors are recorded.\n");
-    fprintf(stderr, "        \"screen-direct-force\" is not recommended unless you use a VRR monitor on Nvidia X11 and you are aware that using this option can cause games to freeze/crash or other issues because of Nvidia driver issues.\n");
+    fprintf(stderr, "        \"screen-direct-force\" is not recommended unless you use a VRR (G-SYNC) monitor on Nvidia X11 and you are aware that using this option can cause games to freeze/crash or other issues because of Nvidia driver issues.\n");
     fprintf(stderr, "        \"screen-direct-force\" option is only available on Nvidia X11. VRR works without this option on other systems.\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "  -c    Container format for output file, for example mp4, or flv. Only required if no output file is specified or if recording in replay buffer mode.\n");
@@ -849,10 +861,11 @@ static void usage_full() {
     fprintf(stderr, "        Forcefully set to 'h264' if the file container type is 'flv'.\n");
     fprintf(stderr, "        Forcefully set to 'hevc' on AMD/intel if video codec is 'h264' and if the file container type is 'mkv'.\n");
     fprintf(stderr, "        'hevc_hdr' and 'av1_hdr' option is not available on X11.\n");
-    fprintf(stderr, "        Note: hdr metadata is not included in the video when recording with 'hevc_hdr'/'av1_hdr' because of bugs in AMD, Intel and NVIDIA drivers (amazin', they are bugged).\n");
+    fprintf(stderr, "        Note: hdr metadata is not included in the video when recording with 'hevc_hdr'/'av1_hdr' because of bugs in AMD, Intel and NVIDIA drivers (amazin', they are all bugged).\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "  -ac   Audio codec to use. Should be either 'aac', 'opus' or 'flac'. Defaults to 'opus' for .mp4/.mkv files, otherwise defaults to 'aac'.\n");
     fprintf(stderr, "        'opus' and 'flac' is only supported by .mp4/.mkv files. 'opus' is recommended for best performance and smallest audio size.\n");
+    fprintf(stderr, "        Flac audio codec is option is disable at the moment because of a temporary issue.\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "  -ab   Audio bitrate to use. Optional, by default the bitrate is 128000 for opus and flac and 160000 for aac.\n");
     fprintf(stderr, "        If this is set to 0 then it's the same as if it's absent, in which case the bitrate is determined automatically depending on the audio codec.\n");
@@ -864,7 +877,7 @@ static void usage_full() {
     fprintf(stderr, "  -fm   Framerate mode. Should be either 'cfr' or 'vfr'. Defaults to 'vfr'.\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "  -cr   Color range. Should be either 'limited' (aka mpeg) or 'full' (aka jpeg). Defaults to 'limited'.\n");
-    fprintf(stderr, "        Limited color range means that colors are in range 16-235 while full color range means that colors are in range 0-255 (when not recording with hdr).\n");
+    fprintf(stderr, "        Limited color range means that colors are in range 16-235 (4112-60395 for hdr) while full color range means that colors are in range 0-255 (0-65535 for hdr).\n");
     fprintf(stderr, "        Note that some buggy video players (such as vlc) are unable to correctly display videos in full color range.\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "  -v    Prints per second, fps updates. Optional, set to 'yes' by default.\n");
@@ -1031,6 +1044,22 @@ static void run_recording_saved_script_async(const char *script_file, const char
     }
 }
 
+static double audio_codec_get_desired_delay(AudioCodec audio_codec, int fps) { // Returns the audio startup delay, in seconds, for |audio_codec| when recording video at |fps|; the result is never negative
+    const double fps_inv = 1.0 / (double)fps; // Duration of one video frame in seconds
+    const double base = 0.01 + 1.0/165.0; // Baseline delay in seconds. NOTE(review): the origin of 0.01 and 1/165 is not documented here; presumably empirically tuned - confirm
+    switch(audio_codec) {
+        case AudioCodec::OPUS:
+            return std::max(0.0, base - fps_inv); // Baseline minus one video frame duration, clamped to 0
+        case AudioCodec::AAC:
+            return std::max(0.0, (base + 0.008) * 2.0 - fps_inv); // Larger delay than opus: twice (baseline + 0.008 s), minus one video frame duration, clamped to 0
+        case AudioCodec::FLAC:
+            // TODO: Test
+            return std::max(0.0, base - fps_inv); // Currently the same formula as opus
+    }
+    assert(false); // Only reached if |audio_codec| matches none of the cases above
+    return std::max(0.0, base - fps_inv); // Fallback for builds where assert is compiled out: same formula as opus
+}
+
 struct AudioDevice {
     SoundDevice sound_device;
     AudioInput audio_input;
@@ -1417,7 +1446,7 @@ static void list_supported_video_codecs() {
     // TODO: Output hdr
     if(find_h264_encoder(egl.gpu_info.vendor, card_path))
         puts("h264");
-    if(find_h265_encoder(egl.gpu_info.vendor, card_path))
+    if(find_hevc_encoder(egl.gpu_info.vendor, card_path))
         puts("hevc");
     if(find_av1_encoder(egl.gpu_info.vendor, card_path))
         puts("av1");
@@ -1707,10 +1736,10 @@ int main(int argc, char **argv) {
         usage();
     }
 
-    AudioCodec audio_codec = AudioCodec::AAC;
+    AudioCodec audio_codec = AudioCodec::OPUS;
     const char *audio_codec_to_use = args["-ac"].value();
     if(!audio_codec_to_use)
-        audio_codec_to_use = "aac";
+        audio_codec_to_use = "opus";
 
     if(strcmp(audio_codec_to_use, "aac") == 0) {
         audio_codec = AudioCodec::AAC;
@@ -1723,10 +1752,10 @@ int main(int argc, char **argv) {
         usage();
     }
 
-    if(audio_codec == AudioCodec::OPUS || audio_codec == AudioCodec::FLAC) {
-        fprintf(stderr, "Warning: opus and flac audio codecs are temporary disabled, using aac audio codec instead\n");
-        audio_codec_to_use = "aac";
-        audio_codec = AudioCodec::AAC;
+    if(audio_codec == AudioCodec::FLAC) {
+        fprintf(stderr, "Warning: flac audio codec is temporary disabled, using opus audio codec instead\n");
+        audio_codec_to_use = "opus";
+        audio_codec = AudioCodec::OPUS;
     }
 
     int audio_bitrate = 0;
@@ -2059,6 +2088,8 @@ int main(int argc, char **argv) {
             file_extension = file_extension.substr(0, comma_index);
     }
 
+    const bool force_no_audio_offset = file_extension == "ts" || file_extension == "flv";
+
     if(egl.gpu_info.vendor != GSR_GPU_VENDOR_NVIDIA && file_extension == "mkv" && strcmp(video_codec_to_use, "h264") == 0) {
         video_codec_to_use = "hevc";
         video_codec = VideoCodec::HEVC;
@@ -2085,6 +2116,7 @@ int main(int argc, char **argv) {
                 audio_codec = AudioCodec::AAC;
                 fprintf(stderr, "Warning: flac audio codec is only supported by .mp4 and .mkv files, falling back to aac instead\n");
             } else if(uses_amix) {
+                // TODO: remove this? is it true anymore?
                 audio_codec_to_use = "opus";
                 audio_codec = AudioCodec::OPUS;
                 fprintf(stderr, "Warning: flac audio codec is not supported when mixing audio sources, falling back to opus instead\n");
@@ -2109,16 +2141,18 @@ int main(int argc, char **argv) {
                 video_codec = VideoCodec::H264;
             }
         } else {
-            const AVCodec *h265_codec = find_h265_encoder(egl.gpu_info.vendor, egl.card_path);
+            const AVCodec *hevc_codec = find_hevc_encoder(egl.gpu_info.vendor, egl.card_path);
 
-            if(h265_codec && fps > 60) {
+            if(hevc_codec && fps > 60) {
                 fprintf(stderr, "Warning: recording at higher fps than 60 with hevc might result in recording at a very low fps. If this happens, switch to h264 or av1\n");
             }
 
+            // TODO: Default to h264 if resolution is around 1366x768 on AMD
+
             // hevc generally allows recording at a higher resolution than h264 on nvidia cards. On a gtx 1080 4k is the max resolution for h264 but for hevc it's 8k.
             // Another important info is that when recording at a higher fps than.. 60? hevc has very bad performance. For example when recording at 144 fps the fps drops to 1
             // while with h264 the fps doesn't drop.
-            if(!h265_codec) {
+            if(!hevc_codec) {
                 fprintf(stderr, "Info: using h264 encoder because a codec was not specified and your gpu does not support hevc\n");
                 video_codec_to_use = "h264";
                 video_codec = VideoCodec::H264;
@@ -2145,7 +2179,7 @@ int main(int argc, char **argv) {
             break;
         case VideoCodec::HEVC:
         case VideoCodec::HEVC_HDR:
-            video_codec_f = find_h265_encoder(egl.gpu_info.vendor, egl.card_path);
+            video_codec_f = find_hevc_encoder(egl.gpu_info.vendor, egl.card_path);
             break;
         case VideoCodec::AV1:
         case VideoCodec::AV1_HDR:
@@ -2159,7 +2193,7 @@ int main(int argc, char **argv) {
                 fprintf(stderr, "Warning: selected video codec h264 is not supported, trying hevc instead\n");
                 video_codec_to_use = "hevc";
                 video_codec = VideoCodec::HEVC;
-                video_codec_f = find_h265_encoder(egl.gpu_info.vendor, egl.card_path);
+                video_codec_f = find_hevc_encoder(egl.gpu_info.vendor, egl.card_path);
                 break;
             }
             case VideoCodec::HEVC:
@@ -2266,6 +2300,7 @@ int main(int argc, char **argv) {
     if(video_stream)
         avcodec_parameters_from_context(video_stream->codecpar, video_codec_context);
 
+    int audio_max_frame_size = 1024;
     int audio_stream_index = VIDEO_STREAM_INDEX + 1;
     for(const MergedAudioInputs &merged_audio_inputs : requested_audio_inputs) {
         const bool use_amix = merged_audio_inputs.audio_inputs.size() > 1;
@@ -2300,6 +2335,12 @@ int main(int argc, char **argv) {
 
         // TODO: Cleanup above
 
+        const double audio_fps = (double)audio_codec_context->sample_rate / (double)audio_codec_context->frame_size;
+        const double timeout_sec = 1000.0 / audio_fps / 1000.0;
+
+        const double audio_startup_time_seconds = force_no_audio_offset ? 0 : audio_codec_get_desired_delay(audio_codec, fps);// * ((double)audio_codec_context->frame_size / 1024.0);
+        const double num_audio_frames_shift = audio_startup_time_seconds / timeout_sec;
+
         std::vector<AudioDevice> audio_devices;
         for(size_t i = 0; i < merged_audio_inputs.audio_inputs.size(); ++i) {
             auto &audio_input = merged_audio_inputs.audio_inputs[i];
@@ -2322,7 +2363,7 @@ int main(int argc, char **argv) {
             }
 
             audio_device.frame = create_audio_frame(audio_codec_context);
-            audio_device.frame->pts = 0;
+            audio_device.frame->pts = -audio_codec_context->frame_size * num_audio_frames_shift;
 
             audio_devices.push_back(std::move(audio_device));
         }
@@ -2334,8 +2375,11 @@ int main(int argc, char **argv) {
         audio_track.graph = graph;
         audio_track.sink = sink;
         audio_track.stream_index = audio_stream_index;
+        audio_track.pts = -audio_codec_context->frame_size * num_audio_frames_shift;
         audio_tracks.push_back(std::move(audio_track));
         ++audio_stream_index;
+
+        audio_max_frame_size = std::max(audio_max_frame_size, audio_codec_context->frame_size);
     }
 
     //av_dump_format(av_format_context, 0, filename, 1);
@@ -2362,10 +2406,8 @@ int main(int argc, char **argv) {
         av_dict_free(&options);
     }
 
-    const double start_time_pts = clock_get_monotonic_seconds();
-
-    double start_time = clock_get_monotonic_seconds();
-    double frame_timer_start = start_time - target_fps; // We want to capture the first frame immediately
+    double fps_start_time = clock_get_monotonic_seconds();
+    double frame_timer_start = fps_start_time - target_fps; // We want to capture the first frame immediately
     int fps_counter = 0;
 
     bool paused = false;
@@ -2379,7 +2421,7 @@ int main(int argc, char **argv) {
     std::deque<std::shared_ptr<PacketData>> frame_data_queue;
     bool frames_erased = false;
 
-    const size_t audio_buffer_size = 1024 * 4 * 2; // max 4 bytes/sample, 2 channels
+    const size_t audio_buffer_size = audio_max_frame_size * 4 * 2; // max 4 bytes/sample, 2 channels
     uint8_t *empty_audio = (uint8_t*)malloc(audio_buffer_size);
     if(!empty_audio) {
         fprintf(stderr, "Error: failed to create empty audio\n");
@@ -2387,8 +2429,6 @@ int main(int argc, char **argv) {
     }
     memset(empty_audio, 0, audio_buffer_size);
 
-    const double audio_startup_time_seconds = std::max(0.0, 0.089166 - target_fps);
-
     for(AudioTrack &audio_track : audio_tracks) {
         for(AudioDevice &audio_device : audio_track.audio_devices) {
             audio_device.thread = std::thread([&]() mutable {
@@ -2416,9 +2456,11 @@ int main(int argc, char **argv) {
                     swr_init(swr);
                 }
 
-                double received_audio_time = clock_get_monotonic_seconds();
-                const double timeout_sec = 1000.0 / (double)audio_track.codec_context->sample_rate;
-                const int64_t timeout_ms = std::round(timeout_sec * 1000.0);
+                const double audio_fps = (double)audio_track.codec_context->sample_rate / (double)audio_track.codec_context->frame_size;
+                const int64_t timeout_ms = std::round(1000.0 / audio_fps);
+                const double timeout_sec = 1000.0 / audio_fps / 1000.0;
+                bool first_frame = true;
+                int64_t num_received_frames = 0;
 
                 while(running) {
                     void *sound_buffer;
@@ -2428,18 +2470,17 @@ int main(int argc, char **argv) {
                         // TODO: use this instead of calculating time to read. But this can fluctuate and we dont want to go back in time,
                         // also it's 0.0 for some users???
                         double latency_seconds = 0.0;
-                        sound_buffer_size = sound_device_read_next_chunk(&audio_device.sound_device, &sound_buffer, timeout_sec, &latency_seconds);
+                        sound_buffer_size = sound_device_read_next_chunk(&audio_device.sound_device, &sound_buffer, timeout_sec * 2.0, &latency_seconds);
                     }
 
                     const bool got_audio_data = sound_buffer_size >= 0;
+                    //fprintf(stderr, "got audio data: %s\n", got_audio_data ? "yes" : "no");
                     //const double time_after_read_seconds = clock_get_monotonic_seconds();
                     //const double time_to_read_seconds = time_after_read_seconds - time_before_read_seconds;
-                    const double this_audio_frame_time = (clock_get_monotonic_seconds() - audio_startup_time_seconds) - paused_time_offset;
+                    //fprintf(stderr, "time to read: %f, %s, %f\n", time_to_read_seconds, got_audio_data ? "yes" : "no", timeout_sec);
+                    const double this_audio_frame_time = clock_get_monotonic_seconds() - paused_time_offset;
 
                     if(paused) {
-                        if(got_audio_data)
-                            received_audio_time = this_audio_frame_time;
-
                         if(!audio_device.sound_device.handle)
                             usleep(timeout_ms * 1000);
 
@@ -2453,18 +2494,15 @@ int main(int argc, char **argv) {
                     }
 
                     // TODO: Is this |received_audio_time| really correct?
-                    const double prev_audio_time = received_audio_time;
-                    const double audio_receive_time_diff = this_audio_frame_time - received_audio_time;
-                    int64_t num_missing_frames = std::round(audio_receive_time_diff / timeout_sec);
+                    const int64_t num_expected_frames = std::round((this_audio_frame_time - record_start_time) / timeout_sec);
+                    int64_t num_missing_frames = std::max((int64_t)0LL, num_expected_frames - num_received_frames);
+
                     if(got_audio_data)
-                        num_missing_frames = std::max((int64_t)0, num_missing_frames - 1);
+                        num_missing_frames = std::max((int64_t)0LL, num_missing_frames - 1);
 
                     if(!audio_device.sound_device.handle)
                         num_missing_frames = std::max((int64_t)1, num_missing_frames);
 
-                    if(got_audio_data)
-                        received_audio_time = this_audio_frame_time;
-
                     // Fucking hell is there a better way to do this? I JUST WANT TO KEEP VIDEO AND AUDIO SYNCED HOLY FUCK I WANT TO KILL MYSELF NOW.
                     // THIS PIECE OF SHIT WANTS EMPTY FRAMES OTHERWISE VIDEO PLAYS TOO FAST TO KEEP UP WITH AUDIO OR THE AUDIO PLAYS TOO EARLY.
                     // BUT WE CANT USE DELAYS TO GIVE DUMMY DATA BECAUSE PULSEAUDIO MIGHT GIVE AUDIO A BIG DELAYED!!!
@@ -2472,23 +2510,20 @@ int main(int argc, char **argv) {
                     // videos because bad software such as video editing software and VLC do not support variable frame rate software,
                     // despite nvidia shadowplay and xbox game bar producing variable frame rate videos.
                     // So we have to make sure we produce frames at the same relative rate as the video.
-                    if(num_missing_frames >= 5 || !audio_device.sound_device.handle) {
+                    if((num_missing_frames >= 1 && got_audio_data) || num_missing_frames >= 5 || !audio_device.sound_device.handle) {
                         // TODO:
                         //audio_track.frame->data[0] = empty_audio;
-                        received_audio_time = this_audio_frame_time;
-                        if(needs_audio_conversion)
-                            swr_convert(swr, &audio_device.frame->data[0], audio_track.codec_context->frame_size, (const uint8_t**)&empty_audio, audio_track.codec_context->frame_size);
-                        else
-                            audio_device.frame->data[0] = empty_audio;
+                        if(first_frame || num_missing_frames >= 5) {
+                            if(needs_audio_conversion)
+                                swr_convert(swr, &audio_device.frame->data[0], audio_track.codec_context->frame_size, (const uint8_t**)&empty_audio, audio_track.codec_context->frame_size);
+                            else
+                                audio_device.frame->data[0] = empty_audio;
+                        }
+                        first_frame = false;
 
                         // TODO: Check if duplicate frame can be saved just by writing it with a different pts instead of sending it again
                         std::lock_guard<std::mutex> lock(audio_filter_mutex);
                         for(int i = 0; i < num_missing_frames; ++i) {
-                            const int64_t new_pts = ((prev_audio_time - record_start_time) + timeout_sec * i) * AV_TIME_BASE;
-                            if(new_pts == audio_device.frame->pts)
-                                continue;
-                            
-                            audio_device.frame->pts = new_pts;
                             if(audio_track.graph) {
                                 // TODO: av_buffersrc_add_frame
                                 if(av_buffersrc_write_frame(audio_device.src_filter_ctx, audio_device.frame) < 0) {
@@ -2503,6 +2538,9 @@ int main(int argc, char **argv) {
                                     fprintf(stderr, "Failed to encode audio!\n");
                                 }
                             }
+
+                            audio_device.frame->pts += audio_track.codec_context->frame_size;
+                            num_received_frames++;
                         }
                     }
 
@@ -2515,27 +2553,26 @@ int main(int argc, char **argv) {
                             swr_convert(swr, &audio_device.frame->data[0], audio_track.codec_context->frame_size, (const uint8_t**)&sound_buffer, audio_track.codec_context->frame_size);
                         else
                             audio_device.frame->data[0] = (uint8_t*)sound_buffer;
+                        first_frame = false;
 
-                        const int64_t new_pts = (this_audio_frame_time - record_start_time) * AV_TIME_BASE;
-                        if(new_pts != audio_device.frame->pts) {
-                            audio_device.frame->pts = new_pts;
-
-                            if(audio_track.graph) {
-                                std::lock_guard<std::mutex> lock(audio_filter_mutex);
-                                // TODO: av_buffersrc_add_frame
-                                if(av_buffersrc_write_frame(audio_device.src_filter_ctx, audio_device.frame) < 0) {
-                                    fprintf(stderr, "Error: failed to add audio frame to filter\n");
-                                }
+                        if(audio_track.graph) {
+                            std::lock_guard<std::mutex> lock(audio_filter_mutex);
+                            // TODO: av_buffersrc_add_frame
+                            if(av_buffersrc_write_frame(audio_device.src_filter_ctx, audio_device.frame) < 0) {
+                                fprintf(stderr, "Error: failed to add audio frame to filter\n");
+                            }
+                        } else {
+                            ret = avcodec_send_frame(audio_track.codec_context, audio_device.frame);
+                            if(ret >= 0) {
+                                // TODO: Move to separate thread because this could write to network (for example when livestreaming)
+                                receive_frames(audio_track.codec_context, audio_track.stream_index, audio_track.stream, audio_device.frame->pts, av_format_context, record_start_time, frame_data_queue, replay_buffer_size_secs, frames_erased, write_output_mutex, paused_time_offset);
                             } else {
-                                ret = avcodec_send_frame(audio_track.codec_context, audio_device.frame);
-                                if(ret >= 0) {
-                                    // TODO: Move to separate thread because this could write to network (for example when livestreaming)
-                                    receive_frames(audio_track.codec_context, audio_track.stream_index, audio_track.stream, audio_device.frame->pts, av_format_context, record_start_time, frame_data_queue, replay_buffer_size_secs, frames_erased, write_output_mutex, paused_time_offset);
-                                } else {
-                                    fprintf(stderr, "Failed to encode audio!\n");
-                                }
+                                fprintf(stderr, "Failed to encode audio!\n");
                             }
                         }
+
+                        audio_device.frame->pts += audio_track.codec_context->frame_size;
+                        num_received_frames++;
                     }
                 }
 
@@ -2565,6 +2602,7 @@ int main(int argc, char **argv) {
         }
         ++fps_counter;
 
+        // TODO: Move to another thread, since this shouldn't be locked to video encoding fps
         {
             std::lock_guard<std::mutex> lock(audio_filter_mutex);
             for(AudioTrack &audio_track : audio_tracks) {
@@ -2573,11 +2611,7 @@ int main(int argc, char **argv) {
 
                 int err = 0;
                 while ((err = av_buffersink_get_frame(audio_track.sink, aframe)) >= 0) {
-                    const double this_audio_frame_time = (clock_get_monotonic_seconds() - audio_startup_time_seconds) - paused_time_offset;
-                    const int64_t new_pts = (this_audio_frame_time - record_start_time) * AV_TIME_BASE;
-                    if(new_pts == aframe->pts)
-                        continue;
-                    aframe->pts = new_pts;
+                    aframe->pts = audio_track.pts;
                     err = avcodec_send_frame(audio_track.codec_context, aframe);
                     if(err >= 0){
                         // TODO: Move to separate thread because this could write to network (for example when livestreaming)
@@ -2586,18 +2620,19 @@ int main(int argc, char **argv) {
                         fprintf(stderr, "Failed to encode audio!\n");
                     }
                     av_frame_unref(aframe);
+                    audio_track.pts += audio_track.codec_context->frame_size;
                 }
             }
         }
 
         double time_now = clock_get_monotonic_seconds();
         double frame_timer_elapsed = time_now - frame_timer_start;
-        double elapsed = time_now - start_time;
+        double elapsed = time_now - fps_start_time;
         if (elapsed >= 1.0) {
             if(verbose) {
                 fprintf(stderr, "update fps: %d\n", fps_counter);
             }
-            start_time = time_now;
+            fps_start_time = time_now;
             fps_counter = 0;
         }
 
@@ -2607,7 +2642,7 @@ int main(int argc, char **argv) {
             frame_timer_start = time_now - frame_time_overflow;
 
             const double this_video_frame_time = clock_get_monotonic_seconds() - paused_time_offset;
-            const int64_t expected_frames = std::round((this_video_frame_time - start_time_pts) / target_fps);
+            const int64_t expected_frames = std::round((this_video_frame_time - record_start_time) / target_fps);
             const int num_frames = framerate_mode == FramerateMode::CONSTANT ? std::max((int64_t)0LL, expected_frames - video_pts_counter) : 1;
 
             if(num_frames > 0 && !paused) {