From 033fbf3f1363c810d115ce9a531aea26ea9e1cf1 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Sat, 17 Apr 2021 13:16:03 +0200 Subject: Fix mangakatana search when mangakatana redirect to the exact match --- include/DownloadUtils.hpp | 1 + plugins/MangaGeneric.hpp | 13 +++-- src/DownloadUtils.cpp | 26 +++++++++ src/QuickMedia.cpp | 18 ++++-- src/plugins/MangaGeneric.cpp | 135 +++++++++++++++++++++++++++++++------------ 5 files changed, 144 insertions(+), 49 deletions(-) diff --git a/include/DownloadUtils.hpp b/include/DownloadUtils.hpp index dd74f50..0a68069 100644 --- a/include/DownloadUtils.hpp +++ b/include/DownloadUtils.hpp @@ -21,6 +21,7 @@ namespace QuickMedia { // Return true the return DownloadResult::OK for the download, which also saves the result in cache if |download_to_string_cache| is used using DownloadErrorHandler = std::function; + DownloadResult download_head_to_string(const std::string &url, std::string &result, bool use_browser_useragent = false, bool fail_on_error = true); DownloadResult download_to_string(const std::string &url, std::string &result, const std::vector &additional_args, bool use_browser_useragent = false, bool fail_on_error = true); // Note: This function saves the content to the file atomically DownloadResult download_to_string_cache(const std::string &url, std::string &result, const std::vector &additional_args, bool use_browser_useragent = false, DownloadErrorHandler error_handler = nullptr, Path cache_path = ""); diff --git a/plugins/MangaGeneric.hpp b/plugins/MangaGeneric.hpp index d58672a..4c99909 100644 --- a/plugins/MangaGeneric.hpp +++ b/plugins/MangaGeneric.hpp @@ -9,6 +9,8 @@ namespace QuickMedia { int page_start = 0; }; + // If |url_contains| is null, then any matching query is added. If |title_field| is "text", then the inner text is used. + // If |url_field| is null, then the current page is used instead. struct TextQuery { const char *html_query = nullptr; const char *title_field = nullptr; @@ -16,6 +18,7 @@ namespace QuickMedia { const char *url_contains = nullptr; }; + // If |field_contains| is null, then any matching query is added. If |field_name| is "text", then the inner text is used. struct ThumbnailQuery { const char *html_query = nullptr; const char *field_name = nullptr; @@ -92,12 +95,10 @@ namespace QuickMedia { // example.com/search?q=%s&page=%p // This is required. MangaGenericSearchPage& search_handler(const char *search_template, int page_start); - // If |url_contains| is null, then any matching query is added. If |title_field| is "text", then the inner text is used. // This is required. - MangaGenericSearchPage& text_handler(const char *html_query, const char *title_field, const char *url_field, const char *url_contains); - // If |field_contains| is null, then any matching query is added. If |field_name| is "text", then the inner text is used. + MangaGenericSearchPage& text_handler(std::vector queries); // This is optional. - MangaGenericSearchPage& thumbnail_handler(const char *html_query, const char *field_name, const char *field_contains); + MangaGenericSearchPage& thumbnail_handler(std::vector queries); // If |url_contains| is null, then any matching query is added. If |title_field| is "text", then the inner text is used. // This is required. @@ -130,8 +131,8 @@ namespace QuickMedia { std::string website_url; bool fail_on_http_error; SearchQuery search_query; - TextQuery text_query; - ThumbnailQuery thumbnail_query; + std::vector text_queries; + std::vector thumbnail_queries; ListChaptersQuery list_chapters_query; ListPageQuery list_page_query; MangaIdExtractor manga_id_extractor; diff --git a/src/DownloadUtils.cpp b/src/DownloadUtils.cpp index 336c154..e03bf5b 100644 --- a/src/DownloadUtils.cpp +++ b/src/DownloadUtils.cpp @@ -19,6 +19,32 @@ static int accumulate_string(char *data, int size, void *userdata) { static const char *useragent_str = "user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"; namespace QuickMedia { + DownloadResult download_head_to_string(const std::string &url, std::string &result, bool use_browser_useragent, bool fail_on_error) { + sf::Clock timer; + std::vector args; + args.insert(args.end(), { "curl", "-I", "-H", "Accept-Language: en-US,en;q=0.5", "-H", "Connection: keep-alive", "--compressed", "-s" }); + if(fail_on_error) + args.push_back("-f"); + if(use_browser_useragent) { + args.push_back("-H"); + args.push_back(useragent_str); + } + args.push_back("--"); + args.push_back(url.c_str()); + args.push_back(nullptr); + if(debug_download) { + for(const char *arg : args) { + if(arg) + fprintf(stderr, "'%s' ", arg); + } + fprintf(stderr, "\n"); + } + if(exec_program(args.data(), accumulate_string, &result) != 0) + return DownloadResult::NET_ERR; + fprintf(stderr, "Download duration for %s: %d ms\n", url.c_str(), timer.getElapsedTime().asMilliseconds()); + return DownloadResult::OK; + } + // TODO: Add timeout DownloadResult download_to_string(const std::string &url, std::string &result, const std::vector &additional_args, bool use_browser_useragent, bool fail_on_error) { sf::Clock timer; diff --git a/src/QuickMedia.cpp b/src/QuickMedia.cpp index 44901e8..f85f2da 100644 --- a/src/QuickMedia.cpp +++ b/src/QuickMedia.cpp @@ -685,8 +685,8 @@ namespace QuickMedia { static void add_manganelos_handlers(MangaGenericSearchPage *manga_generic_search_page) { manga_generic_search_page->search_handler("http://manganelos.com/search?q=%s&page=%p", 1) - .text_handler("//div[class='media-left cover-manga']//a", "title", "href", "/manga/") - .thumbnail_handler("//div[class='media-left cover-manga']//img[class='media-object']", "src", "/mangaimage/") + .text_handler({{"//div[class='media-left cover-manga']//a", "title", "href", "/manga/"}}) + .thumbnail_handler({{"//div[class='media-left cover-manga']//img[class='media-object']", "src", "/mangaimage/"}}) .list_chapters_handler("//section[id='examples']//div[class='chapter-list']//a", "text", "href", nullptr) .list_page_images_handler("//p[id='arraydata']", "text", nullptr, [](std::vector &urls) { if(urls.size() != 1) @@ -708,8 +708,8 @@ namespace QuickMedia { static void add_mangatown_handlers(MangaGenericSearchPage *manga_generic_search_page) { manga_generic_search_page->search_handler("https://mangatown.com/search?name=%s&page=%p", 1) - .text_handler("//p[class='title']/a", "title", "href", "/manga/") - .thumbnail_handler("//a[class='manga_cover']/img", "src", nullptr) + .text_handler({{"//p[class='title']/a", "title", "href", "/manga/"}}) + .thumbnail_handler({{"//a[class='manga_cover']/img", "src", nullptr}}) .list_chapters_handler("//ul[class='chapter_list']//a", "text", "href", "/manga/") .list_chapters_uploaded_time_handler("//ul[class='chapter_list']//span[class='time']", "text", nullptr) .list_page_images_pagination_handler( @@ -721,8 +721,14 @@ namespace QuickMedia { static void add_mangakatana_handlers(MangaGenericSearchPage *manga_generic_search_page) { manga_generic_search_page->search_handler("https://mangakatana.com/page/%p?search=%s&search_by=book_name", 1) - .text_handler("//div[id='book_list']//h3[class='title']//a", "text", "href", "/manga/") - .thumbnail_handler("//div[id='book_list']//img", "src", "/cover/") + .text_handler({ + {"//div[id='book_list']//h3[class='title']//a", "text", "href", "/manga/"}, + {"//div[id='single_book']//h1[class='heading']", "text", nullptr, nullptr} + }) + .thumbnail_handler({ + {"//div[id='book_list']//div[class='media']//img", "src", nullptr}, + {"//div[id='single_book']//div[class='cover']//img", "src", nullptr} + }) .list_chapters_handler("//div[class='chapters']//div[class='chapter']//a", "text", "href", "/manga/") .list_chapters_uploaded_time_handler("//div[class='chapters']//div[class='update_time']", "text", nullptr) .list_page_images_custom_handler([](const std::string &html_source) { diff --git a/src/plugins/MangaGeneric.cpp b/src/plugins/MangaGeneric.cpp index 607488f..ebfbdda 100644 --- a/src/plugins/MangaGeneric.cpp +++ b/src/plugins/MangaGeneric.cpp @@ -58,11 +58,18 @@ namespace QuickMedia { [](QuickMediaHtmlNode *node, void *userdata) { HtmlSearchUserdata *search_userdata = (HtmlSearchUserdata*)userdata; const char *field1_value = html_attr_or_inner_text(node, search_userdata->field1); - const char *field2_value = html_attr_or_inner_text(node, search_userdata->field2); - if(field1_value && field2_value && (!search_userdata->field2_contains || strstr(field2_value, search_userdata->field2_contains))) { - auto item = BodyItem::create(strip(field1_value)); - item->url = strip(field2_value); - search_userdata->body_items->push_back(std::move(item)); + if(search_userdata->field2) { + const char *field2_value = html_attr_or_inner_text(node, search_userdata->field2); + if(field1_value && field2_value && (!search_userdata->field2_contains || strstr(field2_value, search_userdata->field2_contains))) { + auto item = BodyItem::create(strip(field1_value)); + item->url = strip(field2_value); + search_userdata->body_items->push_back(std::move(item)); + } + } else { + if(field1_value) { + auto item = BodyItem::create(strip(field1_value)); + search_userdata->body_items->push_back(std::move(item)); + } } }, search_userdata); } @@ -98,6 +105,34 @@ namespace QuickMedia { }, page_image_userdata); } + static size_t str_find_case_insensitive(const std::string &str, size_t start_index, const char *substr, size_t substr_len) { + auto it = std::search(str.begin() + start_index, str.end(), substr, substr + substr_len, + [](char c1, char c2) { + return std::toupper(c1) == std::toupper(c2); + }); + if(it == str.end()) + return std::string::npos; + return it - str.begin(); + } + + static std::string header_extract_location(const std::string &headers) { + size_t index = str_find_case_insensitive(headers, 0, "location:", 9); + if(index != std::string::npos && (index == 0 || headers[index - 1] == '\n')) { + index += 9; + size_t end = headers.find('\r', index); + size_t start = index; + while(start < end) { + char c = headers[start]; + if(c != ' ' && c != '\t') + break; + ++start; + } + if(end - start > 0) + return headers.substr(start, end - start); + } + return ""; + } + MangaGenericSearchPage::MangaGenericSearchPage(Program *program, const char *service_name, const char *website_url, bool fail_on_http_error) : Page(program), service_name(service_name), website_url(website_url ? website_url : ""), fail_on_http_error(fail_on_http_error) { @@ -112,17 +147,7 @@ namespace QuickMedia { } PluginResult MangaGenericSearchPage::get_page(const std::string &str, int page, BodyItems &result_items) { - if(!search_query.search_template || !text_query.html_query || !text_query.title_field || !text_query.url_field) { - assert(false); - return PluginResult::ERR; - } - - HtmlSearchUserdata search_userdata; - search_userdata.body_items = &result_items; - search_userdata.field1 = text_query.title_field; - search_userdata.field2 = text_query.url_field; - search_userdata.field2_contains = text_query.url_contains; - + std::string target_url; std::string url = search_query.search_template; string_replace_all(url, "%s", url_param_encode(str)); string_replace_all(url, "%p", std::to_string(search_query.page_start + page)); @@ -139,19 +164,60 @@ namespace QuickMedia { if(result != 0) goto cleanup; - result = html_append_search(&html_search, text_query.html_query, &search_userdata); - if(result != 0) - goto cleanup; + for(const TextQuery &text_query : text_queries) { + if(!search_query.search_template || !text_query.html_query || !text_query.title_field) { + assert(false); + return PluginResult::ERR; + } - assert(!thumbnail_query.html_query || thumbnail_query.field_name); - if(thumbnail_query.html_query && thumbnail_query.field_name) { - HtmlMergeUserdata merge_userdata; - merge_userdata.type = MergeType::THUMBNAIL; - merge_userdata.body_item_image_context.body_items = &result_items; - merge_userdata.body_item_image_context.index = 0; - merge_userdata.field_name = thumbnail_query.field_name; - merge_userdata.field_contains = thumbnail_query.field_contains; - result = html_body_item_merge(&html_search, thumbnail_query.html_query, &merge_userdata); + BodyItems new_result_items; + HtmlSearchUserdata search_userdata; + search_userdata.body_items = &new_result_items; + search_userdata.field1 = text_query.title_field; + search_userdata.field2 = text_query.url_field; + search_userdata.field2_contains = text_query.url_contains; + + result = html_append_search(&html_search, text_query.html_query, &search_userdata); + if(result != 0) + goto cleanup; + + for(const ThumbnailQuery &thumbnail_query : thumbnail_queries) { + assert(!thumbnail_query.html_query || thumbnail_query.field_name); + if(thumbnail_query.html_query && thumbnail_query.field_name) { + HtmlMergeUserdata merge_userdata; + merge_userdata.type = MergeType::THUMBNAIL; + merge_userdata.body_item_image_context.body_items = &new_result_items; + merge_userdata.body_item_image_context.index = 0; + merge_userdata.field_name = thumbnail_query.field_name; + merge_userdata.field_contains = thumbnail_query.field_contains; + result = html_body_item_merge(&html_search, thumbnail_query.html_query, &merge_userdata); + if(result != 0) + goto cleanup; + } + } + + if(!text_query.url_field && !new_result_items.empty()) { + if(target_url.empty()) { + std::string response_headers; + DownloadResult download_result = download_head_to_string(url, response_headers, true); + if(download_result != DownloadResult::OK) { + result = -1; + goto cleanup; + } + + target_url = header_extract_location(response_headers); + if(target_url.empty()) { + fprintf(stderr, "Failed to extract target location from %s HEAD\n", url.c_str()); + result = -1; + goto cleanup; + } + } + + for(auto &new_body_item : new_result_items) { + new_body_item->url = target_url; + } + } + result_items.insert(result_items.end(), std::move_iterator(new_result_items.begin()), std::move_iterator(new_result_items.end())); } for(auto &body_item : result_items) { @@ -534,18 +600,13 @@ namespace QuickMedia { return *this; } - MangaGenericSearchPage& MangaGenericSearchPage::text_handler(const char *html_query, const char *title_field, const char *url_field, const char *url_contains) { - text_query.html_query = html_query; - text_query.title_field = title_field; - text_query.url_field = url_field; - text_query.url_contains = url_contains; + MangaGenericSearchPage& MangaGenericSearchPage::text_handler(std::vector queries) { + text_queries = std::move(queries); return *this; } - MangaGenericSearchPage& MangaGenericSearchPage::thumbnail_handler(const char *html_query, const char *field_name, const char *field_contains) { - thumbnail_query.html_query = html_query; - thumbnail_query.field_name = field_name; - thumbnail_query.field_contains = field_contains; + MangaGenericSearchPage& MangaGenericSearchPage::thumbnail_handler(std::vector queries) { + thumbnail_queries = std::move(queries); return *this; } -- cgit v1.2.3