author    dec05eba <dec05eba@protonmail.com>  2024-04-11 14:40:27 +0200
committer dec05eba <dec05eba@protonmail.com>  2024-04-11 14:40:27 +0200
commit    52688dad72542b7f3f7bce7a8ff0d7fd7827c5ea (patch)
tree      01f1a4f8ff2209f15e1eed2d621e650a5cf44595 /src
parent    f8322c3c2838635d4a09b36811367b4dcdd7d751 (diff)
Time-based audio latency, test, might fix some issues
Diffstat (limited to 'src')
 -rw-r--r--  src/main.cpp  | 131
 -rw-r--r--  src/sound.cpp |  88
 2 files changed, 70 insertions(+), 149 deletions(-)
diff --git a/src/main.cpp b/src/main.cpp
index faab93b..b3b206c 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -315,7 +315,7 @@ static AVCodecContext* create_audio_codec_context(int fps, AudioCodec audio_code
#endif
codec_context->time_base.num = 1;
- codec_context->time_base.den = codec_context->sample_rate;
+ codec_context->time_base.den = AV_TIME_BASE;
codec_context->framerate.num = fps;
codec_context->framerate.den = 1;
codec_context->thread_count = 1;
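
Setting the encoder time base to 1/AV_TIME_BASE means audio pts values are now expressed in microseconds of wall-clock time rather than in sample counts. A minimal sketch of the conversion the rest of this commit relies on, assuming a monotonic clock helper that returns seconds as a double (clock_get_monotonic_seconds stands in for the project's helper):

    #include <cstdint>
    extern "C" {
    #include <libavutil/avutil.h> // AV_TIME_BASE = 1000000 (microseconds)
    }

    // Convert elapsed wall-clock seconds since recording started into a pts
    // in AV_TIME_BASE units, matching the codec_context->time_base above.
    static int64_t seconds_to_pts(double now_seconds, double record_start_time) {
        return (int64_t)((now_seconds - record_start_time) * AV_TIME_BASE);
    }
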
@@ -1699,10 +1699,10 @@ int main(int argc, char **argv) {
usage();
}
- AudioCodec audio_codec = AudioCodec::OPUS;
+ AudioCodec audio_codec = AudioCodec::AAC;
const char *audio_codec_to_use = args["-ac"].value();
if(!audio_codec_to_use)
- audio_codec_to_use = "opus";
+ audio_codec_to_use = "aac";
if(strcmp(audio_codec_to_use, "aac") == 0) {
audio_codec = AudioCodec::AAC;
@@ -1715,10 +1715,10 @@ int main(int argc, char **argv) {
usage();
}
- if(audio_codec == AudioCodec::FLAC) {
- fprintf(stderr, "Warning: flac audio codec has been temporary disabled, using opus audio codec instead\n");
- audio_codec_to_use = "opus";
- audio_codec = AudioCodec::OPUS;
+ if(audio_codec == AudioCodec::OPUS || audio_codec == AudioCodec::FLAC) {
+ fprintf(stderr, "Warning: opus and flac audio codecs has been temporary disabled, using aac audio codec instead\n");
+ audio_codec_to_use = "aac";
+ audio_codec = AudioCodec::AAC;
}
bool overclock = false;
@@ -2397,58 +2397,21 @@ int main(int argc, char **argv) {
swr_init(swr);
}
- const double target_audio_hz = 1.0 / (double)audio_track.codec_context->sample_rate;
- double received_audio_time = clock_get_monotonic_seconds();
- const int64_t timeout_ms = std::round((1000.0 / (double)audio_track.codec_context->sample_rate) * 1000.0);
-
- // Remove this for now, it doesn't work well for everybody. The timing is different depending on system
- #if 0
- // Move audio forward by around 252 ms (for opus/aac), or 42 ms for flac. This is a crude way to handle audio latency,
- // but the pulseaudio latency calculation returns a much lower value, which isn't helpful.
- if(needs_audio_conversion)
- swr_convert(swr, &audio_device.frame->data[0], audio_track.codec_context->frame_size, (const uint8_t**)&empty_audio, audio_track.codec_context->frame_size);
- else
- audio_device.frame->data[0] = empty_audio;
-
- int num_frames_to_delay = 12;
- if(audio_codec == AudioCodec::FLAC)
- num_frames_to_delay = 2;
-
- for(int i = 0; i < num_frames_to_delay; ++i) {
- if(audio_track.graph) {
- std::lock_guard<std::mutex> lock(audio_filter_mutex);
- // TODO: av_buffersrc_add_frame
- if(av_buffersrc_write_frame(audio_device.src_filter_ctx, audio_device.frame) < 0) {
- fprintf(stderr, "Error: failed to add audio frame to filter\n");
- }
- } else {
- int ret = avcodec_send_frame(audio_track.codec_context, audio_device.frame);
- if(ret >= 0) {
- // TODO: Move to separate thread because this could write to network (for example when livestreaming)
- receive_frames(audio_track.codec_context, audio_track.stream_index, audio_track.stream, audio_device.frame->pts, av_format_context, record_start_time, frame_data_queue, replay_buffer_size_secs, frames_erased, write_output_mutex, paused_time_offset);
- } else {
- fprintf(stderr, "Failed to encode audio!\n");
- }
- }
- audio_device.frame->pts += audio_track.codec_context->frame_size;
- }
- #endif
+ const int64_t no_input_sleep_ms = 500;
while(running) {
void *sound_buffer;
int sound_buffer_size = -1;
if(audio_device.sound_device.handle)
- sound_buffer_size = sound_device_read_next_chunk(&audio_device.sound_device, &sound_buffer);
+ sound_buffer_size = sound_device_read_next_chunk(&audio_device.sound_device, &sound_buffer, 0.5);
+
const bool got_audio_data = sound_buffer_size >= 0;
const double this_audio_frame_time = clock_get_monotonic_seconds() - paused_time_offset;
if(paused) {
- if(got_audio_data)
- received_audio_time = this_audio_frame_time;
-
if(!audio_device.sound_device.handle)
- usleep(timeout_ms * 1000);
+ usleep(no_input_sleep_ms * 1000);
continue;
}
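
The removed block above paced audio by counting missing frames against a running receive timestamp; the new loop instead blocks inside the device read itself for up to half a second and only sleeps manually when no capture device exists. A condensed sketch of the resulting loop shape, with hypothetical placeholder names (device_is_open, sound_device) standing in for the diff's variables:

    // Sketch of the timeout-driven capture loop: block on the device for up
    // to 0.5 s; a negative size means no data arrived within the timeout.
    while (running) {
        void *sound_buffer = nullptr;
        int sound_buffer_size = -1;
        if (device_is_open) // stand-in for audio_device.sound_device.handle
            sound_buffer_size = sound_device_read_next_chunk(&sound_device, &sound_buffer, 0.5);
        const bool got_audio_data = sound_buffer_size >= 0;
        if (!device_is_open)
            usleep(500 * 1000); // nothing to block on, so pace the loop manually
        // ... encode either |sound_buffer| or silence, stamped with a clock-derived pts ...
    }
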
@@ -2459,56 +2422,39 @@ int main(int argc, char **argv) {
break;
}
- // TODO: Is this |received_audio_time| really correct?
- int64_t num_missing_frames = std::round((this_audio_frame_time - received_audio_time) / target_audio_hz / (int64_t)audio_track.codec_context->frame_size);
- if(got_audio_data)
- num_missing_frames = std::max((int64_t)0, num_missing_frames - 1);
-
- if(!audio_device.sound_device.handle)
- num_missing_frames = std::max((int64_t)1, num_missing_frames);
-
- if(got_audio_data)
- received_audio_time = this_audio_frame_time;
-
- // Is there a better way to do this? The goal is just to keep video and audio in sync.
- // The encoder needs empty frames, otherwise the video plays too fast relative to the audio, or the audio plays too early.
- // We can't use delays to insert dummy data because pulseaudio might deliver the audio with a large delay.
- // This padding is needed because we want to produce constant frame rate videos instead of variable frame rate
- // videos, because common software such as video editors and VLC do not support variable frame rate video,
- // despite nvidia shadowplay and xbox game bar producing variable frame rate videos.
- // So we have to make sure we produce frames at the same relative rate as the video.
- if(num_missing_frames >= 5 || !audio_device.sound_device.handle) {
+ if(!got_audio_data) {
// TODO:
//audio_track.frame->data[0] = empty_audio;
- received_audio_time = this_audio_frame_time;
if(needs_audio_conversion)
swr_convert(swr, &audio_device.frame->data[0], audio_track.codec_context->frame_size, (const uint8_t**)&empty_audio, audio_track.codec_context->frame_size);
else
audio_device.frame->data[0] = empty_audio;
- // TODO: Check if duplicate frame can be saved just by writing it with a different pts instead of sending it again
- std::lock_guard<std::mutex> lock(audio_filter_mutex);
- for(int i = 0; i < num_missing_frames; ++i) {
- if(audio_track.graph) {
- // TODO: av_buffersrc_add_frame
- if(av_buffersrc_write_frame(audio_device.src_filter_ctx, audio_device.frame) < 0) {
- fprintf(stderr, "Error: failed to add audio frame to filter\n");
- }
+ const int64_t new_pts = (this_audio_frame_time - record_start_time) * AV_TIME_BASE;
+ if(new_pts == audio_device.frame->pts)
+ continue;
+ audio_device.frame->pts = new_pts;
+ //audio_device.frame->linesize[0] = sound_buffer_size / 2;
+
+ if(audio_track.graph) {
+ std::lock_guard<std::mutex> lock(audio_filter_mutex);
+ // TODO: av_buffersrc_add_frame
+ if(av_buffersrc_write_frame(audio_device.src_filter_ctx, audio_device.frame) < 0) {
+ fprintf(stderr, "Error: failed to add audio frame to filter\n");
+ }
+ } else {
+ ret = avcodec_send_frame(audio_track.codec_context, audio_device.frame);
+ if(ret >= 0) {
+ // TODO: Move to separate thread because this could write to network (for example when livestreaming)
+ receive_frames(audio_track.codec_context, audio_track.stream_index, audio_track.stream, audio_device.frame->pts, av_format_context, record_start_time, frame_data_queue, replay_buffer_size_secs, frames_erased, write_output_mutex, paused_time_offset);
} else {
- ret = avcodec_send_frame(audio_track.codec_context, audio_device.frame);
- if(ret >= 0) {
- // TODO: Move to separate thread because this could write to network (for example when livestreaming)
- receive_frames(audio_track.codec_context, audio_track.stream_index, audio_track.stream, audio_device.frame->pts, av_format_context, record_start_time, frame_data_queue, replay_buffer_size_secs, frames_erased, write_output_mutex, paused_time_offset);
- } else {
- fprintf(stderr, "Failed to encode audio!\n");
- }
+ fprintf(stderr, "Failed to encode audio!\n");
}
- audio_device.frame->pts += audio_track.codec_context->frame_size;
}
}
if(!audio_device.sound_device.handle)
- usleep(timeout_ms * 1000);
+ usleep(no_input_sleep_ms * 1000);
if(got_audio_data) {
// TODO: Instead of converting audio, get float audio from alsa. Or does alsa do conversion internally to get this format?
@@ -2517,6 +2463,12 @@ int main(int argc, char **argv) {
else
audio_device.frame->data[0] = (uint8_t*)sound_buffer;
+ const int64_t new_pts = (this_audio_frame_time - record_start_time) * AV_TIME_BASE;
+ if(new_pts == audio_device.frame->pts)
+ continue;
+ audio_device.frame->pts = new_pts;
+ //audio_device.frame->linesize[0] = sound_buffer_size / 2;
+
if(audio_track.graph) {
std::lock_guard<std::mutex> lock(audio_filter_mutex);
// TODO: av_buffersrc_add_frame
@@ -2532,8 +2484,6 @@ int main(int argc, char **argv) {
fprintf(stderr, "Failed to encode audio!\n");
}
}
-
- audio_device.frame->pts += audio_track.codec_context->frame_size;
}
}
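
Since the encoder now stamps frames in AV_TIME_BASE units while the output stream keeps its own time base, timestamps have to be rescaled when packets are muxed. receive_frames is expected to do something along these lines internally; a hypothetical sketch, not the project's actual code:

    extern "C" {
    #include <libavcodec/avcodec.h>    // AVPacket, AVRational
    #include <libavutil/mathematics.h> // av_rescale_q
    }

    // Rescale a packet's timestamps from the encoder time base (1/AV_TIME_BASE)
    // to the output stream's time base before writing it to the muxer.
    static void rescale_packet_ts(AVPacket *pkt, AVRational enc_tb, AVRational stream_tb) {
        pkt->pts      = av_rescale_q(pkt->pts, enc_tb, stream_tb);
        pkt->dts      = av_rescale_q(pkt->dts, enc_tb, stream_tb);
        pkt->duration = av_rescale_q(pkt->duration, enc_tb, stream_tb);
    }
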
@@ -2571,7 +2521,11 @@ int main(int argc, char **argv) {
int err = 0;
while ((err = av_buffersink_get_frame(audio_track.sink, aframe)) >= 0) {
- aframe->pts = audio_track.pts;
+ const int64_t new_pts = ((clock_get_monotonic_seconds() - paused_time_offset) - record_start_time) * AV_TIME_BASE;
+ if(new_pts == aframe->pts) {
+     av_frame_unref(aframe); // unref before skipping so the duplicate frame's reference is not leaked
+     continue;
+ }
+ aframe->pts = new_pts;
+ //aframe->linesize[0] = sound_buffer_size / 2;
err = avcodec_send_frame(audio_track.codec_context, aframe);
if(err >= 0){
// TODO: Move to separate thread because this could write to network (for example when livestreaming)
@@ -2580,7 +2534,6 @@ int main(int argc, char **argv) {
fprintf(stderr, "Failed to encode audio!\n");
}
av_frame_unref(aframe);
- audio_track.pts += audio_track.codec_context->frame_size;
}
}
}
diff --git a/src/sound.cpp b/src/sound.cpp
index c3aa4d4..99342f2 100644
--- a/src/sound.cpp
+++ b/src/sound.cpp
@@ -41,6 +41,7 @@ struct pa_handle {
size_t output_index, output_length;
int operation_success;
+ double latency_seconds;
};
static void pa_sound_device_free(pa_handle *s) {
@@ -79,6 +80,7 @@ static pa_handle* pa_sound_device_new(const char *server,
p->read_data = NULL;
p->read_length = 0;
p->read_index = 0;
+ p->latency_seconds = 0;
const int buffer_size = attr->maxlength;
void *buffer = malloc(buffer_size);
@@ -153,78 +155,41 @@ fail:
return NULL;
}
-// Returns a negative value on failure or if |p->output_length| data is not available within the time frame specified by the sample rate
-static int pa_sound_device_read(pa_handle *p) {
+// Reads the next chunk from the stream. Returns the number of bytes read on
+// success, or a negative value on failure or if no data arrived within |timeout_seconds|.
+static int pa_sound_device_read(pa_handle *p, double timeout_seconds) {
assert(p);
- const int64_t timeout_ms = std::round((1000.0 / (double)pa_stream_get_sample_spec(p->stream)->rate) * 1000.0);
const double start_time = clock_get_monotonic_seconds();
- bool success = false;
int r = 0;
+ //pa_usec_t latency = 0;
+ //int negative = 0;
int *rerror = &r;
CHECK_DEAD_GOTO(p, rerror, fail);
- while (p->output_index < p->output_length) {
- if((clock_get_monotonic_seconds() - start_time) * 1000 >= timeout_ms)
- return -1;
+ while(clock_get_monotonic_seconds() - start_time < timeout_seconds) {
+ pa_mainloop_prepare(p->mainloop, 1 * 1000);
+ pa_mainloop_poll(p->mainloop);
+ pa_mainloop_dispatch(p->mainloop);
- if(!p->read_data) {
- pa_mainloop_prepare(p->mainloop, 1 * 1000); // 1 ms
- pa_mainloop_poll(p->mainloop);
- pa_mainloop_dispatch(p->mainloop);
-
- if(pa_stream_peek(p->stream, &p->read_data, &p->read_length) < 0)
- goto fail;
-
- if(!p->read_data && p->read_length == 0)
- continue;
+ if(pa_stream_peek(p->stream, &p->read_data, &p->read_length) < 0)
+ goto fail;
- if(!p->read_data && p->read_length > 0) {
- // There is a hole in the stream :( drop it. Maybe we should generate silence instead? TODO
- if(pa_stream_drop(p->stream) != 0)
- goto fail;
- continue;
- }
+ if(!p->read_data && p->read_length == 0)
+ continue;
- if(p->read_length <= 0) {
- p->read_data = NULL;
- if(pa_stream_drop(p->stream) != 0)
- goto fail;
+ // pa_operation_unref(pa_stream_update_timing_info(p->stream, NULL, NULL));
+ // if (pa_stream_get_latency(p->stream, &latency, &negative) >= 0) {
+ // fprintf(stderr, "latency: %lu ms, negative: %d, extra delay: %f ms\n", latency / 1000, negative, (clock_get_monotonic_seconds() - start_time) * 1000.0);
+ // }
- CHECK_DEAD_GOTO(p, rerror, fail);
- continue;
- }
- }
-
- const size_t space_free_in_output_buffer = p->output_length - p->output_index;
- if(space_free_in_output_buffer < p->read_length) {
- memcpy(p->output_data + p->output_index, (const uint8_t*)p->read_data + p->read_index, space_free_in_output_buffer);
- p->output_index = 0;
- p->read_index += space_free_in_output_buffer;
- p->read_length -= space_free_in_output_buffer;
- break;
- } else {
- memcpy(p->output_data + p->output_index, (const uint8_t*)p->read_data + p->read_index, p->read_length);
- p->output_index += p->read_length;
- p->read_data = NULL;
- p->read_length = 0;
- p->read_index = 0;
-
- if(pa_stream_drop(p->stream) != 0)
- goto fail;
-
- if(p->output_index == p->output_length) {
- p->output_index = 0;
- break;
- }
- }
+ memcpy(p->output_data, p->read_data, p->read_length);
+ pa_stream_drop(p->stream);
+ p->latency_seconds = clock_get_monotonic_seconds() - start_time;
+ return p->read_length;
}
- success = true;
-
fail:
- return success ? 0 : -1;
+ return -1;
}
static pa_sample_format_t audio_format_to_pulse_audio_format(AudioFormat audio_format) {
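
The rewritten read no longer accumulates a fixed-size period: it hands back whatever pa_stream_peek delivered, so callers now receive variable-sized chunks. If a consumer still needed fixed codec-frame-sized blocks, it would have to buffer on its side; a hypothetical caller-side sketch, not part of this commit:

    #include <vector>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Hypothetical caller-side accumulator: collects variable-sized chunks
    // from the sound device until one full codec frame's worth of bytes is
    // available.
    struct ChunkAccumulator {
        std::vector<uint8_t> buf;

        void push(const void *data, size_t size) {
            const uint8_t *p = (const uint8_t*)data;
            buf.insert(buf.end(), p, p + size);
        }

        // Returns true and fills |out| once |frame_bytes| of audio is buffered.
        bool pop_frame(uint8_t *out, size_t frame_bytes) {
            if (buf.size() < frame_bytes)
                return false;
            std::memcpy(out, buf.data(), frame_bytes);
            buf.erase(buf.begin(), buf.begin() + frame_bytes);
            return true;
        }
    };
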
@@ -269,6 +234,7 @@ int sound_device_get_by_name(SoundDevice *device, const char *device_name, const
device->handle = handle;
device->frames = period_frame_size;
+ device->latency_seconds = 0.0;
return 0;
}
@@ -278,14 +244,16 @@ void sound_device_close(SoundDevice *device) {
device->handle = NULL;
}
-int sound_device_read_next_chunk(SoundDevice *device, void **buffer) {
+int sound_device_read_next_chunk(SoundDevice *device, void **buffer, double timeout_sec) {
pa_handle *pa = (pa_handle*)device->handle;
- if(pa_sound_device_read(pa) < 0) {
+ int size = pa_sound_device_read(pa, timeout_sec);
+ if(size < 0) {
//fprintf(stderr, "pa_simple_read() failed: %s\n", pa_strerror(error));
return -1;
}
*buffer = pa->output_data;
- return device->frames;
+ device->latency_seconds = pa->latency_seconds;
+ return size;
}
static void pa_state_cb(pa_context *c, void *userdata) {
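
For reference, a minimal usage sketch of the updated capture API, assuming the SoundDevice fields declared in the project's sound header; encode_chunk is a hypothetical consumer:

    // Wait up to half a second for the next chunk; a negative return means
    // the timeout expired without any data becoming available.
    void *buffer = nullptr;
    const int size = sound_device_read_next_chunk(&device, &buffer, 0.5);
    if (size >= 0) {
        // |size| bytes of captured audio are in |buffer|; latency_seconds
        // records how long the read blocked before data arrived.
        encode_chunk(buffer, size, device.latency_seconds);
    }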