From c470174b397c7a81b7510f191e404cb895d462af Mon Sep 17 00:00:00 2001 From: dec05eba Date: Sat, 3 Oct 2020 08:58:11 +0200 Subject: Nyaa.si: parse html instead of rss, allows for pagination --- src/plugins/NyaaSi.cpp | 208 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 140 insertions(+), 68 deletions(-) (limited to 'src/plugins') diff --git a/src/plugins/NyaaSi.cpp b/src/plugins/NyaaSi.cpp index 8e13789..98c6eee 100644 --- a/src/plugins/NyaaSi.cpp +++ b/src/plugins/NyaaSi.cpp @@ -3,19 +3,30 @@ #include namespace QuickMedia { - // Returns empty string on error - static std::string get_rss_item_text(const std::string &data, size_t start, size_t end, const std::string &tag_start, const std::string &tag_end) { - size_t item_begin = data.find(tag_start, start); - if(item_begin == std::string::npos || item_begin >= end) - return ""; - - size_t item_end = data.find(tag_end, item_begin + tag_start.size()); - if(item_end == std::string::npos || item_end >= end) - return ""; - - std::string result = data.substr(item_begin + tag_start.size(), item_end - (item_begin + tag_start.size())); - html_unescape_sequences(result); - return strip(result); + // Return end of td tag, or std::string::npos + static size_t find_td_get_value(const std::string &str, size_t start_index, size_t end_index, std::string &result) { + size_t td_begin = str.find("= end_index) + return std::string::npos; + + size_t td_end = str.find("", td_begin + 3); + if(td_end == std::string::npos || td_end >= end_index) + return std::string::npos; + + size_t value_begin = str.find('>', td_begin + 3); + if(value_begin == std::string::npos || value_begin >= td_end) + return std::string::npos; + + result = str.substr(value_begin + 1, td_end - (value_begin + 1)); + return td_end + 5; + } + + static bool is_only_numbers(const char *str, size_t size) { + for(size_t i = 0; i < size; ++i) { + if(str[i] < '0' || str[i] > '9') + return false; + } + return true; } NyaaSi::NyaaSi() : Plugin("nyaa.si") { @@ -60,77 +71,138 @@ namespace QuickMedia { return PluginResult::OK; } + // TODO: Also show the number of comments for each torrent. TODO: Optimize? + // TODO: Show each field as seperate columns instead of seperating by | SearchResult NyaaSi::content_list_search(const std::string &list_url, const std::string &text, BodyItems &result_items) { - std::string full_url = "https://nyaa.si/?page=rss&c=" + list_url + "&f=0&p=1&q="; + std::string full_url = "https://nyaa.si/?c=" + list_url + "&f=0&p=1&q="; full_url += url_param_encode(text); std::string website_data; - if(download_to_string(full_url, website_data, {}, use_tor) != DownloadResult::OK) + if(download_to_string(full_url, website_data, {}, use_tor, true) != DownloadResult::OK) return SearchResult::NET_ERR; - const std::string title_tag_begin = ""; - const std::string title_tag_end = ""; - const std::string link_tag_begin = ""; - const std::string link_tag_end = ""; - const std::string pub_date_tag_begin = ""; - const std::string pub_date_tag_end = ""; - const std::string seeders_tag_begin = ""; - const std::string seeders_tag_end = ""; - const std::string leechers_tag_begin = ""; - const std::string leechers_tag_end = ""; - const std::string downloads_tag_begin = ""; - const std::string downloads_tag_end = ""; - const std::string category_id_tag_begin = ""; - const std::string category_id_tag_end = ""; - const std::string size_tag_begin = ""; - const std::string size_tag_end = ""; - const std::string comments_tag_begin = ""; - const std::string comments_tag_end = ""; - const std::string trusted_tag_begin = ""; - const std::string trusted_tag_end = ""; - const std::string remake_tag_begin = ""; - const std::string remake_tag_end = ""; - - size_t index = 0; - while(index < website_data.size()) { - size_t item_start = website_data.find("", index); - if(item_start == std::string::npos) + size_t tbody_begin = website_data.find(""); + if(tbody_begin == std::string::npos) + return SearchResult::ERR; + + size_t tbody_end = website_data.find("", tbody_begin + 7); + if(tbody_end == std::string::npos) + return SearchResult::ERR; + + size_t index = tbody_begin + 7; + while(index < tbody_end) { + size_t tr_begin = website_data.find("= tbody_end) break; - index = item_start + 6; + size_t tr_end = website_data.find("", tr_begin + 3); + if(tr_end == std::string::npos || tr_end >= tbody_end) + return SearchResult::ERR; + + index = tr_begin + 3; + + bool is_trusted = false; + bool is_remake = false; + size_t tr_class_begin = website_data.find("class=\"", index); + if(tr_class_begin != std::string::npos && tr_class_begin < tr_end) { + size_t tr_class_end = website_data.find('"', tr_class_begin + 7); + size_t class_length = tr_class_end - (tr_class_begin + 7); + if(strncmp(website_data.c_str() + tr_class_begin + 7, "success", class_length) == 0) + is_trusted = true; + else if(strncmp(website_data.c_str() + tr_class_begin + 7, "danger", class_length) == 0) + is_remake = true; + index = tr_class_end + 1; + } + + size_t category_begin = website_data.find("/?c=", index); + if(category_begin == std::string::npos || category_begin >= tr_end) + return SearchResult::ERR; + + size_t category_end = website_data.find('"', category_begin + 4); + if(category_end == std::string::npos || category_end >= tr_end) + return SearchResult::ERR; - size_t item_end = website_data.find("", index); - if(item_end == std::string::npos) + index = category_end + 1; + size_t view_begin = website_data.find("/view/", index); + if(view_begin == std::string::npos || view_begin >= tr_end) return SearchResult::ERR; - std::string title = get_rss_item_text(website_data, index, item_end, title_tag_begin, title_tag_end); - std::string link = get_rss_item_text(website_data, index, item_end, link_tag_begin, link_tag_end); - std::string pub_date = get_rss_item_text(website_data, index, item_end, pub_date_tag_begin, pub_date_tag_end); - std::string seeders = get_rss_item_text(website_data, index, item_end, seeders_tag_begin, seeders_tag_end); - std::string leechers = get_rss_item_text(website_data, index, item_end, leechers_tag_begin, leechers_tag_end); - std::string downloads = get_rss_item_text(website_data, index, item_end, downloads_tag_begin, downloads_tag_end); - std::string category_id = get_rss_item_text(website_data, index, item_end, category_id_tag_begin, category_id_tag_end); - std::string size = get_rss_item_text(website_data, index, item_end, size_tag_begin, size_tag_end); - std::string comments = get_rss_item_text(website_data, index, item_end, comments_tag_begin, comments_tag_end); - std::string trusted = get_rss_item_text(website_data, index, item_end, trusted_tag_begin, trusted_tag_end); - std::string remake = get_rss_item_text(website_data, index, item_end, remake_tag_begin, remake_tag_end); - - if(title.empty() || link.empty() || pub_date.empty() || seeders.empty() || leechers.empty() || downloads.empty() || category_id.empty() || size.empty()) { - fprintf(stderr, "Error: failed to parse nyaa.si rss items\n"); + size_t view_end = website_data.find('"', view_begin + 6); + if(view_end == std::string::npos || view_end >= tr_end) return SearchResult::ERR; + + std::string view_url = website_data.substr(view_begin, view_end - view_begin); + + // Torrents with comments have two /view/, one for comments and one for the title + if(!is_only_numbers(website_data.c_str() + view_begin + 6, view_end - (view_begin + 6))) { + size_t view_begin2 = website_data.find("/view/", view_end + 1); + if(view_begin2 == std::string::npos || view_begin2 >= tr_end) + return SearchResult::ERR; + + size_t view_end2 = website_data.find('"', view_begin2 + 6); + if(view_end2 == std::string::npos || view_end2 >= tr_end) + return SearchResult::ERR; + + view_end = view_end2; } + size_t title_begin = website_data.find('>', view_end + 1); + if(title_begin == std::string::npos || title_begin >= tr_end) + return SearchResult::ERR; + size_t title_end = website_data.find("", title_begin + 1); + if(title_end == std::string::npos || title_end >= tr_end) + return SearchResult::ERR; + std::string title = website_data.substr(title_begin + 1, title_end - (title_begin + 1)); + html_unescape_sequences(title); + title = strip(title); + + index = title_end + 4; + size_t magnet_begin = website_data.find("magnet:?xt", index); + if(magnet_begin == std::string::npos || magnet_begin >= tr_end) + return SearchResult::ERR; + + size_t magnet_end = website_data.find('"', magnet_begin + 10); + if(magnet_end == std::string::npos || magnet_end >= tr_end) + return SearchResult::ERR; + + index = magnet_end + 1; + std::string size; + index = find_td_get_value(website_data, index, tr_end, size); + if(index == std::string::npos) + return SearchResult::ERR; + + std::string timestamp; + index = find_td_get_value(website_data, index, tr_end, timestamp); + if(index == std::string::npos) + return SearchResult::ERR; + + std::string seeders; + index = find_td_get_value(website_data, index, tr_end, seeders); + if(index == std::string::npos) + return SearchResult::ERR; + + std::string leechers; + index = find_td_get_value(website_data, index, tr_end, leechers); + if(index == std::string::npos) + return SearchResult::ERR; + + std::string completed; + index = find_td_get_value(website_data, index, tr_end, completed); + if(index == std::string::npos) + return SearchResult::ERR; + + index = tr_end + 5; + + std::string description = "Size: " + size + " | Published: " + timestamp + " | Seeders: " + seeders + " | Leechers: " + leechers + " | Completed: " + completed; auto body_item = BodyItem::create(std::move(title)); - body_item->url = std::move(link); - body_item->thumbnail_url = "https://nyaa.si/static/img/icons/nyaa/" + category_id + ".png"; - body_item->set_description("Published: " + pub_date + "\nSeeders: " + seeders + "\nLeechers: " + leechers + "\nDownloads: " + downloads + "\nSize: " + size + "\nComments: " + comments); - if(trusted == "Yes") - body_item->title_color = sf::Color(43, 200, 47); - else if(remake == "Yes") - body_item->title_color = sf::Color(200, 45, 47); + body_item->thumbnail_url = "https://nyaa.si/static/img/icons/nyaa/" + website_data.substr(category_begin + 4, category_end - (category_begin + 4)) + ".png"; + body_item->set_description(std::move(description)); + body_item->url = "https://nyaa.si" + std::move(view_url); + if(is_trusted) + body_item->title_color = sf::Color(43, 255, 47); + else if(is_remake) + body_item->title_color = sf::Color(255, 45, 47); result_items.push_back(std::move(body_item)); - - index = item_end + 7; } return SearchResult::OK; -- cgit v1.2.3