From d1920de838b17847b2e8a1520af0d82e670d5558 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Tue, 11 May 2021 16:04:02 +0200 Subject: Parse content disposition better (when it contains encoding) --- include/NetUtils.hpp | 1 + src/DownloadUtils.cpp | 27 +++++++++++++++++---------- src/NetUtils.cpp | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 10 deletions(-) diff --git a/include/NetUtils.hpp b/include/NetUtils.hpp index bacafc7..a142884 100644 --- a/include/NetUtils.hpp +++ b/include/NetUtils.hpp @@ -12,6 +12,7 @@ namespace QuickMedia { void html_escape_sequences(std::string &str); void html_unescape_sequences(std::string &str); std::string url_param_encode(const std::string &param); + std::string url_param_decode(const std::string &param); std::vector extract_urls(const std::string &str); std::vector ranges_get_strings(const std::string &str, const std::vector &ranges); void convert_utf8_to_utf32_ranges(const std::string &str, std::vector &ranges); diff --git a/src/DownloadUtils.cpp b/src/DownloadUtils.cpp index a054454..5b5246a 100644 --- a/src/DownloadUtils.cpp +++ b/src/DownloadUtils.cpp @@ -73,7 +73,21 @@ namespace QuickMedia { fprintf(stderr, "Download duration for %s: %d ms\n", url.c_str(), timer.getElapsedTime().asMilliseconds()); std::string content_disposition = header_extract_value(header, "content-disposition"); - if(content_disposition.empty()) { + size_t filename_start = content_disposition.find("filename="); + if(filename_start == std::string::npos) { + // TODO: after filename*= the encoding type will follow. 
// Returns the numeric value (0-15) of the hexadecimal digit |c|.
// Both upper-case and lower-case digits are accepted.
// Returns -1 if |c| is not a hexadecimal digit.
static int get_hex_value(char c) {
    if(c >= '0' && c <= '9')
        return c - '0';
    else if(c >= 'A' && c <= 'F')
        return 10 + (c - 'A');
    else if(c >= 'a' && c <= 'f')
        return 10 + (c - 'a');
    else
        return -1;
}

// Decodes percent-encoded sequences in |param| and returns the decoded string.
//
// - "%XY", where X and Y are hexadecimal digits, becomes the single byte 0xXY.
// - "%%" becomes a single '%'.
// - A '%' that is not followed by two hexadecimal digits (including a '%' at or near
//   the end of the input) is not an escape sequence and is copied through unchanged,
//   so input that was never percent-encoded (for example the file name "100%.txt")
//   is not altered.
// - Every other byte is copied through unchanged. Note that '+' is NOT converted to a space.
std::string url_param_decode(const std::string &param) {
    const size_t size = param.size();

    std::string result;
    // The decoded string is never longer than the input
    result.reserve(size);

    size_t i = 0;
    while(i < size) {
        const char c = param[i];
        if(c != '%') {
            result += c;
            i += 1;
            continue;
        }

        // "%%" is treated as an escaped percent sign
        if(i + 1 < size && param[i + 1] == '%') {
            result += '%';
            i += 2;
            continue;
        }

        // "%XY": only decode when both hex digits are present and valid
        if(i + 2 < size) {
            const int high_nibble = get_hex_value(param[i + 1]);
            const int low_nibble = get_hex_value(param[i + 2]);
            if(high_nibble != -1 && low_nibble != -1) {
                result += static_cast<char>((high_nibble << 4) | low_nibble);
                i += 3;
                continue;
            }
        }

        // Malformed or truncated escape: keep the '%' instead of dropping it.
        // The bytes that follow are handled as regular input on the next iterations.
        result += '%';
        i += 1;
    }
    return result;
}