aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordec05eba <dec05eba@protonmail.com>2021-04-17 13:16:03 +0200
committerdec05eba <dec05eba@protonmail.com>2021-04-17 13:16:03 +0200
commit033fbf3f1363c810d115ce9a531aea26ea9e1cf1 (patch)
tree2a89743424201ac5195245694a1af609c2f7414e
parent23435ac30b25cd1a0a8d52c4a792d90f38300344 (diff)
Fix mangakatana search when mangakatana redirects to the exact match
-rw-r--r--include/DownloadUtils.hpp1
-rw-r--r--plugins/MangaGeneric.hpp13
-rw-r--r--src/DownloadUtils.cpp26
-rw-r--r--src/QuickMedia.cpp18
-rw-r--r--src/plugins/MangaGeneric.cpp135
5 files changed, 144 insertions, 49 deletions
diff --git a/include/DownloadUtils.hpp b/include/DownloadUtils.hpp
index dd74f50..0a68069 100644
--- a/include/DownloadUtils.hpp
+++ b/include/DownloadUtils.hpp
@@ -21,6 +21,7 @@ namespace QuickMedia {
// Return true to make the download return DownloadResult::OK, which also saves the result in cache if |download_to_string_cache| is used
using DownloadErrorHandler = std::function<bool(std::string&)>;
+ DownloadResult download_head_to_string(const std::string &url, std::string &result, bool use_browser_useragent = false, bool fail_on_error = true);
DownloadResult download_to_string(const std::string &url, std::string &result, const std::vector<CommandArg> &additional_args, bool use_browser_useragent = false, bool fail_on_error = true);
// Note: This function saves the content to the file atomically
DownloadResult download_to_string_cache(const std::string &url, std::string &result, const std::vector<CommandArg> &additional_args, bool use_browser_useragent = false, DownloadErrorHandler error_handler = nullptr, Path cache_path = "");
diff --git a/plugins/MangaGeneric.hpp b/plugins/MangaGeneric.hpp
index d58672a..4c99909 100644
--- a/plugins/MangaGeneric.hpp
+++ b/plugins/MangaGeneric.hpp
@@ -9,6 +9,8 @@ namespace QuickMedia {
int page_start = 0;
};
+ // If |url_contains| is null, then any matching query is added. If |title_field| is "text", then the inner text is used.
+ // If |url_field| is null, then the current page is used instead.
struct TextQuery {
const char *html_query = nullptr;
const char *title_field = nullptr;
@@ -16,6 +18,7 @@ namespace QuickMedia {
const char *url_contains = nullptr;
};
+ // If |field_contains| is null, then any matching query is added. If |field_name| is "text", then the inner text is used.
struct ThumbnailQuery {
const char *html_query = nullptr;
const char *field_name = nullptr;
@@ -92,12 +95,10 @@ namespace QuickMedia {
// example.com/search?q=%s&page=%p
// This is required.
MangaGenericSearchPage& search_handler(const char *search_template, int page_start);
- // If |url_contains| is null, then any matching query is added. If |title_field| is "text", then the inner text is used.
// This is required.
- MangaGenericSearchPage& text_handler(const char *html_query, const char *title_field, const char *url_field, const char *url_contains);
- // If |field_contains| is null, then any matching query is added. If |field_name| is "text", then the inner text is used.
+ MangaGenericSearchPage& text_handler(std::vector<TextQuery> queries);
// This is optional.
- MangaGenericSearchPage& thumbnail_handler(const char *html_query, const char *field_name, const char *field_contains);
+ MangaGenericSearchPage& thumbnail_handler(std::vector<ThumbnailQuery> queries);
// If |url_contains| is null, then any matching query is added. If |title_field| is "text", then the inner text is used.
// This is required.
@@ -130,8 +131,8 @@ namespace QuickMedia {
std::string website_url;
bool fail_on_http_error;
SearchQuery search_query;
- TextQuery text_query;
- ThumbnailQuery thumbnail_query;
+ std::vector<TextQuery> text_queries;
+ std::vector<ThumbnailQuery> thumbnail_queries;
ListChaptersQuery list_chapters_query;
ListPageQuery list_page_query;
MangaIdExtractor manga_id_extractor;
diff --git a/src/DownloadUtils.cpp b/src/DownloadUtils.cpp
index 336c154..e03bf5b 100644
--- a/src/DownloadUtils.cpp
+++ b/src/DownloadUtils.cpp
@@ -19,6 +19,32 @@ static int accumulate_string(char *data, int size, void *userdata) {
static const char *useragent_str = "user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36";
namespace QuickMedia {
+ // Runs `curl -I` (silent mode) against |url| and hands curl's output to |accumulate_string| with |result| as userdata.
+ // |fail_on_error| adds curl's "-f" flag and |use_browser_useragent| adds the |useragent_str| header.
+ // Returns DownloadResult::NET_ERR when |exec_program| reports a non-zero status, otherwise DownloadResult::OK.
+ DownloadResult download_head_to_string(const std::string &url, std::string &result, bool use_browser_useragent, bool fail_on_error) {
+     // Started first so the duration logged at the end covers the whole call
+     sf::Clock timer;
+
+     std::vector<const char*> args = { "curl", "-I", "-H", "Accept-Language: en-US,en;q=0.5", "-H", "Connection: keep-alive", "--compressed", "-s" };
+     if(fail_on_error)
+         args.push_back("-f");
+     if(use_browser_useragent)
+         args.insert(args.end(), { "-H", useragent_str });
+     // "--" ends option parsing so the url is never read as a flag; the trailing nullptr terminates the argument list
+     args.insert(args.end(), { "--", url.c_str(), nullptr });
+
+     if(debug_download) {
+         for(size_t i = 0; i < args.size(); ++i) {
+             // Skip the terminating nullptr
+             if(!args[i])
+                 continue;
+             fprintf(stderr, "'%s' ", args[i]);
+         }
+         fprintf(stderr, "\n");
+     }
+
+     const bool program_succeeded = exec_program(args.data(), accumulate_string, &result) == 0;
+     if(!program_succeeded)
+         return DownloadResult::NET_ERR;
+
+     fprintf(stderr, "Download duration for %s: %d ms\n", url.c_str(), timer.getElapsedTime().asMilliseconds());
+     return DownloadResult::OK;
+ }
+
// TODO: Add timeout
DownloadResult download_to_string(const std::string &url, std::string &result, const std::vector<CommandArg> &additional_args, bool use_browser_useragent, bool fail_on_error) {
sf::Clock timer;
diff --git a/src/QuickMedia.cpp b/src/QuickMedia.cpp
index 44901e8..f85f2da 100644
--- a/src/QuickMedia.cpp
+++ b/src/QuickMedia.cpp
@@ -685,8 +685,8 @@ namespace QuickMedia {
static void add_manganelos_handlers(MangaGenericSearchPage *manga_generic_search_page) {
manga_generic_search_page->search_handler("http://manganelos.com/search?q=%s&page=%p", 1)
- .text_handler("//div[class='media-left cover-manga']//a", "title", "href", "/manga/")
- .thumbnail_handler("//div[class='media-left cover-manga']//img[class='media-object']", "src", "/mangaimage/")
+ .text_handler({{"//div[class='media-left cover-manga']//a", "title", "href", "/manga/"}})
+ .thumbnail_handler({{"//div[class='media-left cover-manga']//img[class='media-object']", "src", "/mangaimage/"}})
.list_chapters_handler("//section[id='examples']//div[class='chapter-list']//a", "text", "href", nullptr)
.list_page_images_handler("//p[id='arraydata']", "text", nullptr, [](std::vector<std::string> &urls) {
if(urls.size() != 1)
@@ -708,8 +708,8 @@ namespace QuickMedia {
static void add_mangatown_handlers(MangaGenericSearchPage *manga_generic_search_page) {
manga_generic_search_page->search_handler("https://mangatown.com/search?name=%s&page=%p", 1)
- .text_handler("//p[class='title']/a", "title", "href", "/manga/")
- .thumbnail_handler("//a[class='manga_cover']/img", "src", nullptr)
+ .text_handler({{"//p[class='title']/a", "title", "href", "/manga/"}})
+ .thumbnail_handler({{"//a[class='manga_cover']/img", "src", nullptr}})
.list_chapters_handler("//ul[class='chapter_list']//a", "text", "href", "/manga/")
.list_chapters_uploaded_time_handler("//ul[class='chapter_list']//span[class='time']", "text", nullptr)
.list_page_images_pagination_handler(
@@ -721,8 +721,14 @@ namespace QuickMedia {
static void add_mangakatana_handlers(MangaGenericSearchPage *manga_generic_search_page) {
manga_generic_search_page->search_handler("https://mangakatana.com/page/%p?search=%s&search_by=book_name", 1)
- .text_handler("//div[id='book_list']//h3[class='title']//a", "text", "href", "/manga/")
- .thumbnail_handler("//div[id='book_list']//img", "src", "/cover/")
+ .text_handler({
+ {"//div[id='book_list']//h3[class='title']//a", "text", "href", "/manga/"},
+ {"//div[id='single_book']//h1[class='heading']", "text", nullptr, nullptr}
+ })
+ .thumbnail_handler({
+ {"//div[id='book_list']//div[class='media']//img", "src", nullptr},
+ {"//div[id='single_book']//div[class='cover']//img", "src", nullptr}
+ })
.list_chapters_handler("//div[class='chapters']//div[class='chapter']//a", "text", "href", "/manga/")
.list_chapters_uploaded_time_handler("//div[class='chapters']//div[class='update_time']", "text", nullptr)
.list_page_images_custom_handler([](const std::string &html_source) {
diff --git a/src/plugins/MangaGeneric.cpp b/src/plugins/MangaGeneric.cpp
index 607488f..ebfbdda 100644
--- a/src/plugins/MangaGeneric.cpp
+++ b/src/plugins/MangaGeneric.cpp
@@ -58,11 +58,18 @@ namespace QuickMedia {
[](QuickMediaHtmlNode *node, void *userdata) {
HtmlSearchUserdata *search_userdata = (HtmlSearchUserdata*)userdata;
const char *field1_value = html_attr_or_inner_text(node, search_userdata->field1);
- const char *field2_value = html_attr_or_inner_text(node, search_userdata->field2);
- if(field1_value && field2_value && (!search_userdata->field2_contains || strstr(field2_value, search_userdata->field2_contains))) {
- auto item = BodyItem::create(strip(field1_value));
- item->url = strip(field2_value);
- search_userdata->body_items->push_back(std::move(item));
+ if(search_userdata->field2) {
+ const char *field2_value = html_attr_or_inner_text(node, search_userdata->field2);
+ if(field1_value && field2_value && (!search_userdata->field2_contains || strstr(field2_value, search_userdata->field2_contains))) {
+ auto item = BodyItem::create(strip(field1_value));
+ item->url = strip(field2_value);
+ search_userdata->body_items->push_back(std::move(item));
+ }
+ } else {
+ if(field1_value) {
+ auto item = BodyItem::create(strip(field1_value));
+ search_userdata->body_items->push_back(std::move(item));
+ }
}
}, search_userdata);
}
@@ -98,6 +105,34 @@ namespace QuickMedia {
}, page_image_userdata);
}
+ // Returns the index of the first case-insensitive occurrence of |substr| (|substr_len| bytes) in |str|,
+ // searching from |start_index|. Returns std::string::npos if there is no match or if |start_index| is past the end of |str|.
+ static size_t str_find_case_insensitive(const std::string &str, size_t start_index, const char *substr, size_t substr_len) {
+     // An offset beyond the end would create an invalid iterator below
+     if(start_index > str.size())
+         return std::string::npos;
+
+     auto it = std::search(str.begin() + start_index, str.end(), substr, substr + substr_len,
+         [](char c1, char c2) {
+             // std::toupper is only defined for values representable as unsigned char (or EOF)
+             return std::toupper(static_cast<unsigned char>(c1)) == std::toupper(static_cast<unsigned char>(c2));
+         });
+     if(it == str.end())
+         return std::string::npos;
+     return it - str.begin();
+ }
+
+ // Extracts the value of the first non-empty "Location" header in |headers| (raw http response headers).
+ // The header name is matched case-insensitively and has to be at the start of a line, so headers that only end with
+ // "location:" (for example "Content-Location:") are skipped instead of ending the search.
+ // The value ends at the first '\r' or '\n' (or at the end of |headers|) and surrounding spaces/tabs are removed.
+ // Returns an empty string if no such header is found.
+ static std::string header_extract_location(const std::string &headers) {
+     static const char field_name[] = "location:";
+     const size_t field_name_len = sizeof(field_name) - 1;
+
+     size_t search_from = 0;
+     for(;;) {
+         const size_t index = str_find_case_insensitive(headers, search_from, field_name, field_name_len);
+         if(index == std::string::npos)
+             return "";
+
+         // Always continue after this match so the loop makes progress
+         search_from = index + field_name_len;
+
+         // Not at the start of a line, so this is part of another header name or of a header value
+         if(index != 0 && headers[index - 1] != '\n')
+             continue;
+
+         size_t start = search_from;
+         size_t end = headers.find_first_of("\r\n", start);
+         if(end == std::string::npos)
+             end = headers.size();
+
+         while(start < end && (headers[start] == ' ' || headers[start] == '\t'))
+             ++start;
+         while(end > start && (headers[end - 1] == ' ' || headers[end - 1] == '\t'))
+             --end;
+
+         if(end > start)
+             return headers.substr(start, end - start);
+     }
+ }
+
MangaGenericSearchPage::MangaGenericSearchPage(Program *program, const char *service_name, const char *website_url, bool fail_on_http_error) :
Page(program), service_name(service_name), website_url(website_url ? website_url : ""), fail_on_http_error(fail_on_http_error)
{
@@ -112,17 +147,7 @@ namespace QuickMedia {
}
PluginResult MangaGenericSearchPage::get_page(const std::string &str, int page, BodyItems &result_items) {
- if(!search_query.search_template || !text_query.html_query || !text_query.title_field || !text_query.url_field) {
- assert(false);
- return PluginResult::ERR;
- }
-
- HtmlSearchUserdata search_userdata;
- search_userdata.body_items = &result_items;
- search_userdata.field1 = text_query.title_field;
- search_userdata.field2 = text_query.url_field;
- search_userdata.field2_contains = text_query.url_contains;
-
+ std::string target_url;
std::string url = search_query.search_template;
string_replace_all(url, "%s", url_param_encode(str));
string_replace_all(url, "%p", std::to_string(search_query.page_start + page));
@@ -139,19 +164,60 @@ namespace QuickMedia {
if(result != 0)
goto cleanup;
- result = html_append_search(&html_search, text_query.html_query, &search_userdata);
- if(result != 0)
- goto cleanup;
+ for(const TextQuery &text_query : text_queries) {
+ if(!search_query.search_template || !text_query.html_query || !text_query.title_field) {
+ assert(false);
+ return PluginResult::ERR;
+ }
- assert(!thumbnail_query.html_query || thumbnail_query.field_name);
- if(thumbnail_query.html_query && thumbnail_query.field_name) {
- HtmlMergeUserdata merge_userdata;
- merge_userdata.type = MergeType::THUMBNAIL;
- merge_userdata.body_item_image_context.body_items = &result_items;
- merge_userdata.body_item_image_context.index = 0;
- merge_userdata.field_name = thumbnail_query.field_name;
- merge_userdata.field_contains = thumbnail_query.field_contains;
- result = html_body_item_merge(&html_search, thumbnail_query.html_query, &merge_userdata);
+ BodyItems new_result_items;
+ HtmlSearchUserdata search_userdata;
+ search_userdata.body_items = &new_result_items;
+ search_userdata.field1 = text_query.title_field;
+ search_userdata.field2 = text_query.url_field;
+ search_userdata.field2_contains = text_query.url_contains;
+
+ result = html_append_search(&html_search, text_query.html_query, &search_userdata);
+ if(result != 0)
+ goto cleanup;
+
+ for(const ThumbnailQuery &thumbnail_query : thumbnail_queries) {
+ assert(!thumbnail_query.html_query || thumbnail_query.field_name);
+ if(thumbnail_query.html_query && thumbnail_query.field_name) {
+ HtmlMergeUserdata merge_userdata;
+ merge_userdata.type = MergeType::THUMBNAIL;
+ merge_userdata.body_item_image_context.body_items = &new_result_items;
+ merge_userdata.body_item_image_context.index = 0;
+ merge_userdata.field_name = thumbnail_query.field_name;
+ merge_userdata.field_contains = thumbnail_query.field_contains;
+ result = html_body_item_merge(&html_search, thumbnail_query.html_query, &merge_userdata);
+ if(result != 0)
+ goto cleanup;
+ }
+ }
+
+ if(!text_query.url_field && !new_result_items.empty()) {
+ if(target_url.empty()) {
+ std::string response_headers;
+ DownloadResult download_result = download_head_to_string(url, response_headers, true);
+ if(download_result != DownloadResult::OK) {
+ result = -1;
+ goto cleanup;
+ }
+
+ target_url = header_extract_location(response_headers);
+ if(target_url.empty()) {
+ fprintf(stderr, "Failed to extract target location from %s HEAD\n", url.c_str());
+ result = -1;
+ goto cleanup;
+ }
+ }
+
+ for(auto &new_body_item : new_result_items) {
+ new_body_item->url = target_url;
+ }
+ }
+ result_items.insert(result_items.end(), std::move_iterator(new_result_items.begin()), std::move_iterator(new_result_items.end()));
}
for(auto &body_item : result_items) {
@@ -534,18 +600,13 @@ namespace QuickMedia {
return *this;
}
- MangaGenericSearchPage& MangaGenericSearchPage::text_handler(const char *html_query, const char *title_field, const char *url_field, const char *url_contains) {
- text_query.html_query = html_query;
- text_query.title_field = title_field;
- text_query.url_field = url_field;
- text_query.url_contains = url_contains;
+ // Stores |queries| as the text queries of this page, replacing the previously stored list.
+ // Returns *this so handler calls can be chained.
+ MangaGenericSearchPage& MangaGenericSearchPage::text_handler(std::vector<TextQuery> queries) {
+ text_queries = std::move(queries);
return *this;
}
- MangaGenericSearchPage& MangaGenericSearchPage::thumbnail_handler(const char *html_query, const char *field_name, const char *field_contains) {
- thumbnail_query.html_query = html_query;
- thumbnail_query.field_name = field_name;
- thumbnail_query.field_contains = field_contains;
+ // Stores |queries| as the thumbnail queries of this page, replacing the previously stored list.
+ // Returns *this so handler calls can be chained.
+ MangaGenericSearchPage& MangaGenericSearchPage::thumbnail_handler(std::vector<ThumbnailQuery> queries) {
+ thumbnail_queries = std::move(queries);
return *this;
}