From ba4e62d55156f9b94b569b56b6382bbcf94b7d86 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Fri, 16 Apr 2021 09:37:53 +0200 Subject: Convert mangatown and manganelos into a generic manga plugin Revert for_each_page.. processing of manga instead of getting all pages. Mangatown requires you to navigate page by page, cant predict what a specific pages image url will be. --- src/plugins/MangaGeneric.cpp | 567 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 567 insertions(+) create mode 100644 src/plugins/MangaGeneric.cpp (limited to 'src/plugins/MangaGeneric.cpp') diff --git a/src/plugins/MangaGeneric.cpp b/src/plugins/MangaGeneric.cpp new file mode 100644 index 0000000..a359698 --- /dev/null +++ b/src/plugins/MangaGeneric.cpp @@ -0,0 +1,567 @@ +#include "../../plugins/MangaGeneric.hpp" +#include "../../include/StringUtils.hpp" +#include +#include + +namespace QuickMedia { + struct HtmlSearchUserdata { + BodyItems *body_items; + const char *field1 = nullptr; + const char *field2 = nullptr; + const char *field2_contains = nullptr; + }; + + enum class MergeType { + THUMBNAIL, + UPLOAD_TIME + }; + + struct HtmlMergeUserdata { + MergeType type; + BodyItemContext body_item_image_context; + const char *field_name = nullptr; + const char *field_contains = nullptr; + }; + + struct HtmlListPageImagesUserdata { + std::vector *urls; + const char *field_name = nullptr; + const char *field_contains = nullptr; + }; + + struct HtmlPageCountUserdata { + int num_pages = 0; + const char *field_name = nullptr; + const char *field_contains = nullptr; + }; + + struct HtmlPageImageUserdata { + std::string *url = nullptr; + const char *field_name = nullptr; + const char *field_contains = nullptr; + }; + + static const char* html_attr_or_inner_text(QuickMediaHtmlNode *node, const char *field_name) { + if(strcmp(field_name, "text") == 0) + return quickmedia_html_node_get_text(node); + else + return quickmedia_html_node_get_attribute_value(node, field_name); + } + + static bool starts_with(const std::string &str, const char *sub) { + size_t sub_len = strlen(sub); + return str.size() >= sub_len && memcmp(str.c_str(), sub, sub_len) == 0; + } + + static int html_append_search(QuickMediaHtmlSearch *html_search, const char *html_query, HtmlSearchUserdata *search_userdata) { + return quickmedia_html_find_nodes_xpath(html_search, html_query, + [](QuickMediaHtmlNode *node, void *userdata) { + HtmlSearchUserdata *search_userdata = (HtmlSearchUserdata*)userdata; + const char *field1_value = html_attr_or_inner_text(node, search_userdata->field1); + const char *field2_value = html_attr_or_inner_text(node, search_userdata->field2); + if(field1_value && field2_value && (!search_userdata->field2_contains || strstr(field2_value, search_userdata->field2_contains))) { + auto item = BodyItem::create(strip(field1_value)); + item->url = strip(field2_value); + search_userdata->body_items->push_back(std::move(item)); + } + }, search_userdata); + } + + static int html_body_item_merge(QuickMediaHtmlSearch *html_search, const char *html_query, HtmlMergeUserdata *merge_userdata) { + return quickmedia_html_find_nodes_xpath(html_search, html_query, + [](QuickMediaHtmlNode *node, void *userdata) { + HtmlMergeUserdata *merge_userdata = (HtmlMergeUserdata*)userdata; + BodyItemContext &body_item_image_context = merge_userdata->body_item_image_context; + const char *field_value = html_attr_or_inner_text(node, merge_userdata->field_name); + if(body_item_image_context.index < body_item_image_context.body_items->size() + && field_value && (!merge_userdata->field_contains || strstr(field_value, merge_userdata->field_contains))) + { + if(merge_userdata->type == MergeType::THUMBNAIL) { + (*body_item_image_context.body_items)[body_item_image_context.index]->thumbnail_url = strip(field_value); + } else if(merge_userdata->type == MergeType::UPLOAD_TIME) { + std::string uploaded_date = strip(field_value); + (*body_item_image_context.body_items)[body_item_image_context.index]->set_description("Uploaded: " + uploaded_date); + } + body_item_image_context.index++; + } + }, merge_userdata); + } + + static int html_get_page_url(QuickMediaHtmlSearch *html_search, const char *html_query, HtmlPageImageUserdata *page_image_userdata) { + return quickmedia_html_find_nodes_xpath(html_search, html_query, + [](QuickMediaHtmlNode *node, void *userdata) { + HtmlPageImageUserdata *page_image_userdata = (HtmlPageImageUserdata*)userdata; + const char *field1_value = html_attr_or_inner_text(node, page_image_userdata->field_name); + if(page_image_userdata->url->empty() && field1_value && (!page_image_userdata->field_contains || strstr(field1_value, page_image_userdata->field_contains))) { + *page_image_userdata->url = strip(field1_value); + } + }, page_image_userdata); + } + + MangaGenericSearchPage::MangaGenericSearchPage(Program *program, const char *service_name, const char *website_url) : Page(program), service_name(service_name), website_url(website_url ? website_url : "") + { + if(!this->website_url.empty()) { + if(this->website_url.back() != '/') + this->website_url.push_back('/'); + } + } + + SearchResult MangaGenericSearchPage::search(const std::string &str, BodyItems &result_items) { + return plugin_result_to_search_result(get_page(str, 0, result_items)); + } + + PluginResult MangaGenericSearchPage::get_page(const std::string &str, int page, BodyItems &result_items) { + if(!search_query.search_prefix || !search_query.page_prefix || !text_query.html_query || !text_query.title_field || !text_query.url_field) { + assert(false); + return PluginResult::ERR; + } + + HtmlSearchUserdata search_userdata; + search_userdata.body_items = &result_items; + search_userdata.field1 = text_query.title_field; + search_userdata.field2 = text_query.url_field; + search_userdata.field2_contains = text_query.url_contains; + + std::string url = search_query.search_prefix; + url += url_param_encode(str); + url += search_query.page_prefix + std::to_string(search_query.page_start + page); + + std::string website_data; + if(download_to_string(url, website_data, {}, true) != DownloadResult::OK) + return PluginResult::NET_ERR; + + if(website_data.empty()) + return PluginResult::OK; + + QuickMediaHtmlSearch html_search; + int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + if(result != 0) + goto cleanup; + + result = html_append_search(&html_search, text_query.html_query, &search_userdata); + if(result != 0) + goto cleanup; + + assert(!thumbnail_query.html_query || thumbnail_query.field_name); + if(thumbnail_query.html_query && thumbnail_query.field_name) { + HtmlMergeUserdata merge_userdata; + merge_userdata.type = MergeType::THUMBNAIL; + merge_userdata.body_item_image_context.body_items = &result_items; + merge_userdata.body_item_image_context.index = 0; + merge_userdata.field_name = thumbnail_query.field_name; + merge_userdata.field_contains = thumbnail_query.field_contains; + result = html_body_item_merge(&html_search, thumbnail_query.html_query, &merge_userdata); + } + + for(auto &body_item : result_items) { + if(starts_with(body_item->url, "//")) + body_item->url = "https://" + body_item->url.substr(2); + else if(starts_with(body_item->url, "/")) + body_item->url = website_url + body_item->url.substr(1); + + if(starts_with(body_item->thumbnail_url, "//")) + body_item->thumbnail_url = "https://" + body_item->thumbnail_url.substr(2); + else if(starts_with(body_item->thumbnail_url, "/")) + body_item->thumbnail_url = website_url + body_item->thumbnail_url.substr(1); + } + + cleanup: + quickmedia_html_search_deinit(&html_search); + if(result == 0) { + return PluginResult::OK; + } else { + result_items.clear(); + return PluginResult::ERR; + } + } + + PluginResult MangaGenericSearchPage::submit(const std::string &title, const std::string &url, std::vector &result_tabs) { + if(!list_chapters_query.html_query || !list_chapters_query.title_field || !list_chapters_query.url_field) { + assert(false); + return PluginResult::ERR; + } + + BodyItems chapters_items; + HtmlSearchUserdata search_userdata; + search_userdata.body_items = &chapters_items; + search_userdata.field1 = list_chapters_query.title_field; + search_userdata.field2 = list_chapters_query.url_field; + search_userdata.field2_contains = list_chapters_query.url_contains; + + std::string website_data; + if(download_to_string(url, website_data, {}, true) != DownloadResult::OK) + return PluginResult::NET_ERR; + + QuickMediaHtmlSearch html_search; + int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + if(result != 0) + goto cleanup; + + result = html_append_search(&html_search, list_chapters_query.html_query, &search_userdata); + if(result != 0) + goto cleanup; + + assert(!list_chapters_query.uploaded_time_html_query || list_chapters_query.uploaded_time_field_name); + if(list_chapters_query.uploaded_time_html_query && list_chapters_query.uploaded_time_field_name) { + HtmlMergeUserdata merge_userdata; + merge_userdata.type = MergeType::UPLOAD_TIME; + merge_userdata.body_item_image_context.body_items = &chapters_items; + merge_userdata.body_item_image_context.index = 0; + merge_userdata.field_name = list_chapters_query.uploaded_time_field_name; + merge_userdata.field_contains = list_chapters_query.uploaded_time_field_contains; + result = html_body_item_merge(&html_search, list_chapters_query.uploaded_time_html_query, &merge_userdata); + } + + for(auto &body_item : chapters_items) { + if(starts_with(body_item->url, "//")) + body_item->url = "https://" + body_item->url.substr(2); + else if(starts_with(body_item->url, "/")) + body_item->url = website_url + body_item->url.substr(1); + + if(starts_with(body_item->thumbnail_url, "//")) + body_item->thumbnail_url = "https://" + body_item->thumbnail_url.substr(2); + else if(starts_with(body_item->thumbnail_url, "/")) + body_item->thumbnail_url = website_url + body_item->thumbnail_url.substr(1); + } + + cleanup: + quickmedia_html_search_deinit(&html_search); + if(result != 0) + return PluginResult::ERR; + + auto body = create_body(); + body->items = std::move(chapters_items); + result_tabs.push_back(Tab{std::move(body), std::make_unique(program, title, url, manga_id_extractor, service_name, website_url, &list_page_query), create_search_bar("Search...", SEARCH_DELAY_FILTER)}); + return PluginResult::OK; + } + + PluginResult MangaGenericChaptersPage::submit(const std::string &title, const std::string &url, std::vector &result_tabs) { + result_tabs.push_back(Tab{nullptr, std::make_unique(program, content_title, title, url, service_name, website_url, list_page_query), nullptr}); + return PluginResult::OK; + } + + bool MangaGenericChaptersPage::extract_id_from_url(const std::string &url, std::string &manga_id) const { + size_t start_index = url.find(manga_id_extractor.prefix); + if(start_index == std::string::npos) + return false; + + if(!manga_id_extractor.end) { + manga_id = url.substr(start_index); + return true; + } + + start_index += strlen(manga_id_extractor.prefix); + size_t end_index = url.find(manga_id_extractor.end, start_index); + if(end_index == std::string::npos) { + manga_id = url.substr(start_index); + return true; + } + + manga_id = url.substr(start_index, end_index - start_index); + return true; + } + + ImageResult MangaGenericImagesPage::get_number_of_images(int &num_images) { + num_images = 0; + chapter_num_pages = -1; + switch(list_page_query->type) { + case ListPageQueryType::IMAGES: { + ImageResult result = get_page_image_urls(); + if(result != ImageResult::OK) return result; + num_images = chapter_image_urls.size(); + return ImageResult::OK; + } + case ListPageQueryType::PAGINATION: { + const ListPagePaginationQuery *list_page_pagination_query = &list_page_query->pagination_query; + if(!list_page_pagination_query->pages_html_query || !list_page_pagination_query->pages_field_name + || !list_page_pagination_query->image_html_query || !list_page_pagination_query->image_field_name + || !list_page_pagination_query->next_page_html_query || !list_page_pagination_query->next_page_field_name) + { + assert(false); + return ImageResult::ERR; + } + + if(chapter_num_pages != -1) { + num_images = chapter_num_pages; + return ImageResult::OK; + } + + current_image_url.clear(); + next_page_url.clear(); + + HtmlPageCountUserdata page_count_userdata; + page_count_userdata.num_pages = 0; + page_count_userdata.field_name = list_page_pagination_query->pages_field_name; + page_count_userdata.field_contains = list_page_pagination_query->pages_field_contains; + + HtmlPageImageUserdata page_image_userdata; + page_image_userdata.url = ¤t_image_url; + page_image_userdata.field_name = list_page_pagination_query->image_field_name; + page_image_userdata.field_contains = list_page_pagination_query->image_field_contains; + + HtmlPageImageUserdata next_page_userdata; + next_page_userdata.url = &next_page_url; + next_page_userdata.field_name = list_page_pagination_query->next_page_field_name; + next_page_userdata.field_contains = list_page_pagination_query->next_page_field_contains; + + std::string website_data; + if(download_to_string(url, website_data, {}, true) != DownloadResult::OK) + return ImageResult::NET_ERR; + + QuickMediaHtmlSearch html_search; + int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + if(result != 0) + goto cleanup; + + result = quickmedia_html_find_nodes_xpath(&html_search, list_page_pagination_query->pages_html_query, + [](QuickMediaHtmlNode *node, void *userdata) { + HtmlPageCountUserdata *page_count_userdata = (HtmlPageCountUserdata*)userdata; + const char *field1_value = html_attr_or_inner_text(node, page_count_userdata->field_name); + if(field1_value && (!page_count_userdata->field_contains || strstr(field1_value, page_count_userdata->field_contains))) { + page_count_userdata->num_pages++; + } + }, &page_count_userdata); + + if(result == 0 && list_page_pagination_query->pages_post_handler) { + page_count_userdata.num_pages = list_page_pagination_query->pages_post_handler(page_count_userdata.num_pages); + } + + if(result != 0 || page_count_userdata.num_pages == 0) { + result = -1; + goto cleanup; + } + + result = html_get_page_url(&html_search, list_page_pagination_query->image_html_query, &page_image_userdata); + if(result != 0 || current_image_url.empty()) { + result = -1; + goto cleanup; + } + + result = html_get_page_url(&html_search, list_page_pagination_query->next_page_html_query, &next_page_userdata); + if(next_page_url.empty()) + result = -1; + + cleanup: + quickmedia_html_search_deinit(&html_search); + if(result != 0) { + current_image_url.clear(); + next_page_url.clear(); + return ImageResult::ERR; + } + + if(starts_with(current_image_url, "//")) + current_image_url = "https://" + current_image_url.substr(2); + else if(starts_with(current_image_url, "/")) + current_image_url = website_url + current_image_url.substr(1); + + num_images = page_count_userdata.num_pages; + chapter_num_pages = num_images; + return ImageResult::OK; + } + } + return ImageResult::OK; + } + + ImageResult MangaGenericImagesPage::for_each_page_in_chapter(PageCallback callback) { + switch(list_page_query->type) { + case ListPageQueryType::IMAGES: { + ImageResult result = get_page_image_urls(); + if(result != ImageResult::OK) return result; + for(const std::string &url : chapter_image_urls) { + if(!callback(url)) + break; + } + return ImageResult::OK; + } + case ListPageQueryType::PAGINATION: { + const ListPagePaginationQuery *list_page_pagination_query = &list_page_query->pagination_query; + if(!list_page_pagination_query->image_html_query || !list_page_pagination_query->image_field_name + || !list_page_pagination_query->next_page_html_query || !list_page_pagination_query->next_page_field_name) + { + assert(false); + return ImageResult::ERR; + } + + int num_images = 0; + ImageResult result = get_number_of_images(num_images); + if(result != ImageResult::OK) return result; + + if(!callback(current_image_url)) + return ImageResult::OK; + + for(int i = 0; i < num_images; ++i) { + std::string full_url = url + next_page_url; + current_image_url.clear(); + next_page_url.clear(); + + HtmlPageImageUserdata page_image_userdata; + page_image_userdata.url = ¤t_image_url; + page_image_userdata.field_name = list_page_pagination_query->image_field_name; + page_image_userdata.field_contains = list_page_pagination_query->image_field_contains; + + HtmlPageImageUserdata next_page_userdata; + next_page_userdata.url = &next_page_url; + next_page_userdata.field_name = list_page_pagination_query->next_page_field_name; + next_page_userdata.field_contains = list_page_pagination_query->next_page_field_contains; + + std::string image_src; + std::string website_data; + if(download_to_string_cache(full_url, website_data, {}, true) != DownloadResult::OK) + return ImageResult::ERR; + + QuickMediaHtmlSearch html_search; + int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + if(result != 0) + goto cleanup; + + html_get_page_url(&html_search, list_page_pagination_query->image_html_query, &page_image_userdata); + html_get_page_url(&html_search, list_page_pagination_query->next_page_html_query, &next_page_userdata); + + cleanup: + quickmedia_html_search_deinit(&html_search); + + if(starts_with(current_image_url, "//")) + current_image_url = "https://" + current_image_url.substr(2); + else if(starts_with(current_image_url, "/")) + current_image_url = website_url + current_image_url.substr(1); + + if(!callback(current_image_url)) + break; + } + + return ImageResult::OK; + } + } + return ImageResult::OK; + } + + ImageResult MangaGenericImagesPage::get_page_image_urls() { + if(!prev_chapter_url.empty()) + return ImageResult::OK; + + assert(list_page_query->type == ListPageQueryType::IMAGES); + const ListPageImagesQuery *list_page_images_query = &list_page_query->images_query; + if(!list_page_images_query->html_query || !list_page_images_query->field_name) { + assert(false); + return ImageResult::ERR; + } + + HtmlListPageImagesUserdata list_page_images_userdata; + list_page_images_userdata.urls = &chapter_image_urls; + list_page_images_userdata.field_name = list_page_images_query->field_name; + list_page_images_userdata.field_contains = list_page_images_query->field_contains; + + std::string website_data; + if(download_to_string(url, website_data, {}, true) != DownloadResult::OK) + return ImageResult::NET_ERR; + + QuickMediaHtmlSearch html_search; + int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + if(result != 0) + goto cleanup; + + result = quickmedia_html_find_nodes_xpath(&html_search, list_page_images_query->html_query, + [](QuickMediaHtmlNode *node, void *userdata) { + HtmlListPageImagesUserdata *list_page_images_userdata = (HtmlListPageImagesUserdata*)userdata; + const char *field1_value = html_attr_or_inner_text(node, list_page_images_userdata->field_name); + if(field1_value && (!list_page_images_userdata->field_contains || strstr(field1_value, list_page_images_userdata->field_contains))) { + list_page_images_userdata->urls->push_back(strip(field1_value)); + } + }, &list_page_images_userdata); + + if(result == 0 && !chapter_image_urls.empty() && list_page_images_query->post_handler) + list_page_images_query->post_handler(chapter_image_urls); + + for(std::string &url : chapter_image_urls) { + if(starts_with(url, "//")) + url = "https://" + url.substr(2); + else if(starts_with(url, "/")) + url = website_url + url.substr(1); + } + + cleanup: + quickmedia_html_search_deinit(&html_search); + if(result != 0 || chapter_image_urls.empty()) { + chapter_image_urls.clear(); + return ImageResult::ERR; + } + + prev_chapter_url = url; + return ImageResult::OK; + } + + + MangaGenericSearchPage& MangaGenericSearchPage::search_handler(const char *search_prefix, const char *page_prefix, int page_start) { + search_query.search_prefix = search_prefix; + search_query.page_prefix = page_prefix; + search_query.page_start = page_start; + return *this; + } + + MangaGenericSearchPage& MangaGenericSearchPage::text_handler(const char *html_query, const char *title_field, const char *url_field, const char *url_contains) { + text_query.html_query = html_query; + text_query.title_field = title_field; + text_query.url_field = url_field; + text_query.url_contains = url_contains; + return *this; + } + + MangaGenericSearchPage& MangaGenericSearchPage::thumbnail_handler(const char *html_query, const char *field_name, const char *field_contains) { + thumbnail_query.html_query = html_query; + thumbnail_query.field_name = field_name; + thumbnail_query.field_contains = field_contains; + return *this; + } + + MangaGenericSearchPage& MangaGenericSearchPage::list_chapters_handler(const char *html_query, const char *title_field, const char *url_field, const char *url_contains) { + list_chapters_query.html_query = html_query; + list_chapters_query.title_field = title_field; + list_chapters_query.url_field = url_field; + list_chapters_query.url_contains = url_contains; + return *this; + } + + MangaGenericSearchPage& MangaGenericSearchPage::list_chapters_uploaded_time_handler(const char *html_query, const char *field_name, const char *field_contains) { + list_chapters_query.uploaded_time_html_query = html_query; + list_chapters_query.uploaded_time_field_name = field_name; + list_chapters_query.uploaded_time_field_contains = field_contains; + return *this; + } + + MangaGenericSearchPage& MangaGenericSearchPage::list_page_images_handler(const char *html_query, const char *field_name, const char *field_contains, ListPageImagesQueryPost post_handler) { + list_page_query.type = ListPageQueryType::IMAGES; + list_page_query.images_query.html_query = html_query; + list_page_query.images_query.field_name = field_name; + list_page_query.images_query.field_contains = field_contains; + list_page_query.images_query.post_handler = post_handler; + return *this; + } + + MangaGenericSearchPage& MangaGenericSearchPage::list_page_images_pagination_handler( + const char *pages_html_query, const char *pages_field_name, const char *pages_field_contains, ListPagePaginationPagesPost pages_post_handler, + const char *image_html_query, const char *image_field_name, const char *image_field_contains, + const char *next_page_html_query, const char *next_page_field_name, const char *next_page_field_contains) + { + assert(pages_post_handler); + list_page_query.type = ListPageQueryType::PAGINATION; + list_page_query.pagination_query.pages_html_query = pages_html_query; + list_page_query.pagination_query.pages_field_name = pages_field_name; + list_page_query.pagination_query.pages_field_contains = pages_field_contains; + list_page_query.pagination_query.pages_post_handler = pages_post_handler; + + list_page_query.pagination_query.image_html_query = image_html_query; + list_page_query.pagination_query.image_field_name = image_field_name; + list_page_query.pagination_query.image_field_contains = image_field_contains; + + list_page_query.pagination_query.next_page_html_query = next_page_html_query; + list_page_query.pagination_query.next_page_field_name = next_page_field_name; + list_page_query.pagination_query.next_page_field_contains = next_page_field_contains; + return *this; + } + + MangaGenericSearchPage& MangaGenericSearchPage::manga_id_handler(const char *prefix, const char *end) { + manga_id_extractor.prefix = prefix; + manga_id_extractor.end = end; + return *this; + } +} \ No newline at end of file -- cgit v1.2.3