diff options
author | dec05eba <dec05eba@protonmail.com> | 2021-07-03 16:23:36 +0200 |
---|---|---|
committer | dec05eba <dec05eba@protonmail.com> | 2021-07-03 18:34:37 +0200 |
commit | 611d22bf269672ba56f98e12eb6b2a40efdaa5b9 (patch) | |
tree | 5dfd3e98fd08fa7cb6cb82c565b538cc891b6b98 | |
parent | 496f71413df2468a9d3329355ffef08280219808 (diff) |
Remove dependency on tidy, fix ph, support all 4chan markup
Go back to previous page when failing to fetch number of pages
-rw-r--r-- | README.md | 10 | ||||
-rw-r--r-- | TODO | 4 | ||||
m--------- | depends/html-parser | 0 | ||||
m--------- | depends/html-search | 0 | ||||
-rw-r--r-- | plugins/MediaGeneric.hpp | 3 | ||||
-rw-r--r-- | project.conf | 2 | ||||
-rw-r--r-- | src/DownloadUtils.cpp | 3 | ||||
-rw-r--r-- | src/NetUtils.cpp | 3 | ||||
-rw-r--r-- | src/QuickMedia.cpp | 26 | ||||
-rw-r--r-- | src/plugins/Fourchan.cpp | 389 | ||||
-rw-r--r-- | src/plugins/MangaGeneric.cpp | 84 | ||||
-rw-r--r-- | src/plugins/Manganelo.cpp | 83 | ||||
-rw-r--r-- | src/plugins/MediaGeneric.cpp | 47 | ||||
-rw-r--r-- | src/plugins/NyaaSi.cpp | 73 | ||||
-rw-r--r-- | src/plugins/Saucenao.cpp | 27 | ||||
-rw-r--r-- | src/plugins/Soundcloud.cpp | 11 | ||||
-rw-r--r-- | src/plugins/Youtube.cpp | 12 |
17 files changed, 393 insertions, 384 deletions
@@ -95,9 +95,9 @@ Type text and then wait and QuickMedia will automatically search.\ `Ctrl+Right`: Go to the next word.\ `Ctrl+V`: Paste clipboard content into the message input.\ `Ctrl+D`: Clear the input text.\ -`@`: Start searching for a user to mention.\ +`@`: Start searching for a user to mention. #### Server search page controls -`Ctrl+Enter`: Use the server specified in the search input.\ +`Ctrl+Enter`: Use the server specified in the search input. ### 4chan thread controls `Enter`: Show the posts that the selected post replies to and the posts the replied to the selected post.\ `Backspace`: Go back to the previously selected item after selecting it with `Enter`.\ @@ -109,11 +109,11 @@ Type text and then wait and QuickMedia will automatically search.\ `Ctrl+D`: Remove the selected file from the post.\ `Ctrl+I`: Reverse image search the selected image or select an url to open in the browser.\ `1 to 9`/`Numpad 1 to 9`: Select/deselect google captcha image when posting a comment on 4chan.\ -`Ctrl+S`: Save the image/video attached to the selected post.\ +`Ctrl+S`: Save the image/video attached to the selected post. ### File save controls `Tab`: Switch between navigating the file manager and file name.\ `Ctrl+Enter`/`Click on save`: Save the file.\ -`Esc`/`Click on cancel`: Cancel download.\ +`Esc`/`Click on cancel`: Cancel download. ## Matrix text commands `/upload`: Bring up the file manager and select a file to upload to the room, `Esc` to cancel.\ `/logout`: Logout.\ @@ -131,7 +131,7 @@ Note that at the moment, cached images will not be scaled with the dpi. Images d [tabbed](https://tools.suckless.org/tabbed/) can be used to put quickmedia windows into tabs. After installing `tabbed`, run `tabbed -c -k quickmedia launcher -e`. ## Dependencies ### Libraries -`sfml`, `libx11`, `libxrandr`, `jsoncpp`, `tidy`, `libglvnd` (opengl) +`sfml`, `libx11`, `libxrandr`, `jsoncpp`, `libglvnd` (opengl) ### Executables `curl`, `imagemagick` ### Fonts @@ -34,7 +34,6 @@ Cleanup keybindings. Some require ctrl, some dont. Set the icon of the window to be the icon of the plugin. Nice for KDE, GNOME, etc with titlebars. If --no-audio is used then music should be played with a lightweight music player instead. MPV is heavy even for music (60mb RAM). Maybe use sfml audio functions? Update 4chan thread in real time, just like 4chan-x. -Remove tidy dependency and use my own html-parser. Add option to sort by other than timestamp for nyaa.si. Add url preview for matrix (using matrix api, fallback to client url preview (using our own url preview project) if disabled by the homeserver). IMPORTANT: Cleanup old messages in matrix (from matrix plugin), and instead either save them to disk or refetch them from server when going up to read old messages. (High memory usage, high disk space) @@ -172,4 +171,5 @@ Allow ctrl+r for video when the video is loading. Youtube download gets stuck sometimes because of audio. Find a workaround for this. Dynamically change youtube video quality by modifying the itags (and other params?) if download is buffering or if the video is lagging. Use the new media proxy for downloading youtube videos as well. -PgUp/PgDown shouldn't move body by the number of visible items. It should instead move by the height of the body.
\ No newline at end of file +PgUp/PgDown shouldn't move body by the number of visible items. It should instead move by the height of the body. +Add option to view dead link in 4chan with 4chan archive and navigate to crossboard links.
\ No newline at end of file diff --git a/depends/html-parser b/depends/html-parser -Subproject 917f810d7f196fef5959bc3096ce7360df961fc +Subproject 11d3632fe4508bfd2f668b7b1c4d75a88cd6449 diff --git a/depends/html-search b/depends/html-search -Subproject 618dfeab47d4b3a7ddadf70b09fc607861ab271 +Subproject cc37a6af5283b4e4c052427fd0d2940ebce5fc8 diff --git a/plugins/MediaGeneric.hpp b/plugins/MediaGeneric.hpp index a9d1e2a..b1f9030 100644 --- a/plugins/MediaGeneric.hpp +++ b/plugins/MediaGeneric.hpp @@ -34,7 +34,7 @@ namespace QuickMedia { class MediaGenericSearchPage : public Page { public: - MediaGenericSearchPage(Program *program, const char *website_url, sf::Vector2i thumbnail_max_size); + MediaGenericSearchPage(Program *program, const char *website_url, sf::Vector2i thumbnail_max_size, bool cloudflare_bypass); const char* get_title() const override { return "Search"; } bool search_is_filter() override { return false; } SearchResult search(const std::string &str, BodyItems &result_items) override; @@ -68,6 +68,7 @@ namespace QuickMedia { std::vector<MediaTextQuery> related_media_text_queries; std::vector<MediaThumbnailQuery> related_media_thumbnail_queries; MediaRelatedCustomHandler related_custom_handler = nullptr; + bool cloudflare_bypass; }; class MediaGenericRelatedPage : public RelatedVideosPage { diff --git a/project.conf b/project.conf index 1a3d734..908d8a9 100644 --- a/project.conf +++ b/project.conf @@ -5,8 +5,6 @@ version = "1.0.0" platforms = ["posix"] [config] -# Ubuntu requires this -include_dirs = ["/usr/include/tidy"] # This needs to be commented out for now because rapidjson depends on undefined behavior according to gcc... #error_on_warning = "true" diff --git a/src/DownloadUtils.cpp b/src/DownloadUtils.cpp index 7660cee..dff0ecb 100644 --- a/src/DownloadUtils.cpp +++ b/src/DownloadUtils.cpp @@ -19,8 +19,6 @@ namespace QuickMedia { int total_downloaded_size = 0; }; - static const bool debug_download = false; - static int accumulate_string(char *data, int size, void *userdata) { std::string *str = (std::string*)userdata; if(str->size() + size > 1024 * 1024 * 100) // 100mb sane limit, TODO: make configurable @@ -75,6 +73,7 @@ namespace QuickMedia { return 0; } + static const bool debug_download = false; static const char *useragent_str = "user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"; DownloadResult download_head_to_string(const std::string &url, std::string &result, bool use_browser_useragent, bool fail_on_error) { diff --git a/src/NetUtils.cpp b/src/NetUtils.cpp index d6b6cf2..cc19094 100644 --- a/src/NetUtils.cpp +++ b/src/NetUtils.cpp @@ -35,10 +35,11 @@ namespace QuickMedia { }; void html_unescape_sequences(std::string &str) { - const std::array<HtmlUnescapeSequence, 6> unescape_sequences = { + const std::array<HtmlUnescapeSequence, 7> unescape_sequences = { HtmlUnescapeSequence { """, "\"" }, HtmlUnescapeSequence { "'", "'" }, HtmlUnescapeSequence { "'", "'" }, + HtmlUnescapeSequence { " ", "\n" }, HtmlUnescapeSequence { "<", "<" }, HtmlUnescapeSequence { ">", ">" }, HtmlUnescapeSequence { "&", "&" } // This should be last, to not accidentally replace a new sequence caused by replacing this diff --git a/src/QuickMedia.cpp b/src/QuickMedia.cpp index 9788c0a..e21e591 100644 --- a/src/QuickMedia.cpp +++ b/src/QuickMedia.cpp @@ -1217,22 +1217,22 @@ namespace QuickMedia { } } else if(strcmp(plugin_name, "pornhub") == 0) { check_youtube_dl_installed(plugin_name); - auto search_page = std::make_unique<MediaGenericSearchPage>(this, "https://www.pornhub.com/", sf::Vector2i(320/1.5f, 180/1.5f)); + auto search_page = std::make_unique<MediaGenericSearchPage>(this, "https://www.pornhub.com/", sf::Vector2i(320/1.5f, 180/1.5f), false); add_pornhub_handlers(search_page.get()); tabs.push_back(Tab{create_body(false, true), std::move(search_page), create_search_bar("Search...", 500)}); } else if(strcmp(plugin_name, "spankbang") == 0) { check_youtube_dl_installed(plugin_name); - auto search_page = std::make_unique<MediaGenericSearchPage>(this, "https://spankbang.com/", sf::Vector2i(500/2.5f, 281/2.5f)); + auto search_page = std::make_unique<MediaGenericSearchPage>(this, "https://spankbang.com/", sf::Vector2i(500/2.5f, 281/2.5f), true); add_spankbang_handlers(search_page.get()); tabs.push_back(Tab{create_body(false, true), std::move(search_page), create_search_bar("Search...", 500)}); } else if(strcmp(plugin_name, "xvideos") == 0) { check_youtube_dl_installed(plugin_name); - auto search_page = std::make_unique<MediaGenericSearchPage>(this, "https://www.xvideos.com/", sf::Vector2i(352/1.5f, 198/1.5f)); + auto search_page = std::make_unique<MediaGenericSearchPage>(this, "https://www.xvideos.com/", sf::Vector2i(352/1.5f, 198/1.5f), false); add_xvideos_handlers(search_page.get()); tabs.push_back(Tab{create_body(false, true), std::move(search_page), create_search_bar("Search...", 500)}); } else if(strcmp(plugin_name, "xhamster") == 0) { check_youtube_dl_installed(plugin_name); - auto search_page = std::make_unique<MediaGenericSearchPage>(this, "https://xhamster.com/", sf::Vector2i(240, 135)); + auto search_page = std::make_unique<MediaGenericSearchPage>(this, "https://xhamster.com/", sf::Vector2i(240, 135), false); add_xhamster_handlers(search_page.get()); tabs.push_back(Tab{create_body(false, true), std::move(search_page), create_search_bar("Search...", 500)}); } else if(strcmp(plugin_name, "soundcloud") == 0) { @@ -1972,7 +1972,9 @@ namespace QuickMedia { BodyItem *selected_item = tabs[selected_tab].body->get_selected(); if(selected_item && tabs[selected_tab].page->is_trackable()) { TrackablePage *trackable_page = dynamic_cast<TrackablePage*>(tabs[selected_tab].page.get()); - trackable_page->track(selected_item->get_title()); + run_task_with_loading_screen([trackable_page, selected_item](){ + return trackable_page->track(selected_item->get_title()) == TrackResult::OK; + }); } } else if(event.key.code == sf::Keyboard::C && event.key.control) { BodyItem *selected_item = tabs[selected_tab].body->get_selected(); @@ -3183,6 +3185,7 @@ namespace QuickMedia { image_download_future.cancel(); image_download_cancel = false; + num_manga_pages = 0; std::promise<int> num_manga_pages_promise; num_manga_pages_future = num_manga_pages_promise.get_future(); @@ -3346,6 +3349,11 @@ namespace QuickMedia { sf::Event event; download_chapter_images_if_needed(images_page); + if(num_manga_pages == 0) { + current_page = pop_page_stack(); + return 0; + } + if(current_page != PageType::IMAGES || !window.isOpen()) return 0; @@ -3541,6 +3549,11 @@ namespace QuickMedia { } download_chapter_images_if_needed(images_page); + if(num_manga_pages == 0) { + current_page = pop_page_stack(); + return; + } + if(current_page != PageType::IMAGES_CONTINUOUS || !window.isOpen()) return; @@ -4281,6 +4294,9 @@ namespace QuickMedia { update_idle_state(); handle_window_close(); + if(current_page != PageType::CHAT_LOGIN) + break; + if(redraw) { redraw = false; get_body_dimensions(window_size, nullptr, body_pos, body_size); diff --git a/src/plugins/Fourchan.cpp b/src/plugins/Fourchan.cpp index 52024e1..4b2ca61 100644 --- a/src/plugins/Fourchan.cpp +++ b/src/plugins/Fourchan.cpp @@ -3,10 +3,9 @@ #include "../../include/Storage.hpp" #include "../../include/StringUtils.hpp" #include "../../include/NetUtils.hpp" +#include <HtmlParser.h> #include <json/reader.h> #include <string.h> -#include <tidy.h> -#include <tidybuffio.h> // API documentation: https://github.com/4chan/4chan-API @@ -37,133 +36,195 @@ namespace QuickMedia { struct CommentPiece { enum class Type { TEXT, - QUOTE, // > - QUOTELINK, // >>POSTNO, - LINE_CONTINUE + QUOTE, // >, Set for span + QUOTE_CONTINUE, // Set for span + QUOTELINK, // >>POSTNO, Set for a + DEADLINK, // Set for span + CROSSBOARD_LINK, // Set for a + CODEBLOCK // Set for pre }; - DataView text; // Set when type is TEXT, QUOTE or QUOTELINK + std::string text; int64_t quote_postnumber = 0; // Set when type is QUOTELINK Type type; }; - static TidyAttr get_attribute_by_name(TidyNode node, const char *name) { - for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) { - const char *attr_name = tidyAttrName(attr); - if(attr_name && strcmp(name, attr_name) == 0) - return attr; - } - return nullptr; - } - - static const char* get_attribute_value(TidyNode node, const char *name) { - TidyAttr attr = get_attribute_by_name(node, name); - if(attr) - return tidyAttrValue(attr); - return nullptr; - } - - static void lstrip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) { - size_t i = 0; - while(i < size && str[i] == '\n') { - ++i; - } - *output_str = str + i; - *output_size = size - i; - } + enum class NodeType { + A, + SPAN, + PRE + }; - static void rstrip_newline(const char *str, size_t size, size_t *output_size) { - ssize_t i = size - 1; - while(i >= 0 && str[i] == '\n') { - --i; + // Returns -1 if no match + static NodeType tag_name_to_node_type(HtmlStringView str) { + if(str.size == 1 && str.data[0] == 'a') { + return NodeType::A; + } else if(str.size == 4 && memcmp(str.data, "span", 4) == 0) { + return NodeType::SPAN; + } else if(str.size == 3 && memcmp(str.data, "pre", 3) == 0) { + return NodeType::PRE; + } else { + return (NodeType)-1; } - *output_size = i + 1; } - static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) { - lstrip_newline(str, size, output_str, output_size); - rstrip_newline(*output_str, *output_size, output_size); - } + struct HtmlNode { + NodeType node_type; + std::string klass; + std::string href; + int output_count = 0; + }; using CommentPieceCallback = std::function<void(const CommentPiece&)>; - static int extract_comment_pieces(TidyDoc doc, TidyNode node, CommentPieceCallback &callback) { - for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { - const char *node_name = tidyNodeGetName(child); - if(node_name) { - if(strcmp(node_name, "br") == 0) { + struct HtmlParseUserdata { + CommentPieceCallback callback; + std::stack<HtmlNode> html_node; + }; + + static int html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) { + HtmlParseUserdata *parse_userdata = (HtmlParseUserdata*)userdata; + + switch(parse_type) { + case HTML_PARSE_TAG_START: { + if(html_parser->tag_name.size == 2 && memcmp(html_parser->tag_name.data, "br", 2) == 0) { CommentPiece comment_piece; comment_piece.type = CommentPiece::Type::TEXT; - // Warning: Cast from const char* to char* ... - comment_piece.text = { (char*)"\n", 1 }; - callback(comment_piece); - /*} else if(strcmp(node_name, "span") == 0) { - const char *span_class = get_attribute_value(child, "class"); - //fprintf(stderr, "span class: %s\n", span_class); - if(span_class && strcmp(span_class, "quote") == 0) { - CommentPiece comment_piece; - comment_piece.type = CommentPiece::Type::QUOTE; - // Warning: Cast from const char* to char* ... - comment_piece.text = { (char*)"\n", 1 }; - callback(comment_piece); - }*/ + comment_piece.text = "\n"; + parse_userdata->callback(comment_piece); } - } - if(tidyNodeGetType(child) == TidyNode_Text) { - TidyBuffer tidy_buffer; - tidyBufInit(&tidy_buffer); - if(tidyNodeGetText(doc, child, &tidy_buffer)) { - const char *inner_text = (const char*)tidy_buffer.bp; - size_t inner_text_size = tidy_buffer.size; - strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size); - - const char *node_name = tidyNodeGetName(node); - if(node_name && strcmp(node_name, "a") == 0) { - const char *a_class = get_attribute_value(node, "class"); - const char *a_href = get_attribute_value(node, "href"); - if(a_class && a_href && strcmp(a_class, "quotelink") == 0 && strncmp(a_href, "#p", 2) == 0) { - CommentPiece comment_piece; - comment_piece.type = CommentPiece::Type::QUOTELINK; - comment_piece.quote_postnumber = strtoll(a_href + 2, nullptr, 10); - // Warning: Cast from const char* to char* ... - comment_piece.text = { (char*)inner_text, inner_text_size }; - callback(comment_piece); - tidyBufFree(&tidy_buffer); - continue; - } + const NodeType node_type = tag_name_to_node_type(html_parser->tag_name); + if(node_type != (NodeType)-1) + parse_userdata->html_node.push({ node_type, "", "", 0 }); + break; + } + case HTML_PARSE_TAG_END: { + if(!parse_userdata->html_node.empty()) { + const NodeType node_type = tag_name_to_node_type(html_parser->tag_name); + if(node_type != (NodeType)-1) + parse_userdata->html_node.pop(); + } + break; + } + case HTML_PARSE_ATTRIBUTE: { + if(!parse_userdata->html_node.empty()) { + HtmlNode &html_node = parse_userdata->html_node.top(); + if(html_parser->attribute_key.size == 5 && memcmp(html_parser->attribute_key.data, "class", 5) == 0) { + html_node.klass.assign(html_parser->attribute_value.data, html_parser->attribute_value.size); + } else if(html_parser->attribute_key.size == 4 && memcmp(html_parser->attribute_key.data, "href", 4) == 0) { + html_node.href.assign(html_parser->attribute_value.data, html_parser->attribute_value.size); } + } + break; + } + case HTML_PARSE_TEXT: { + std::string text(html_parser->text.data, html_parser->text.size); + html_unescape_sequences(text); - CommentPiece comment_piece; + CommentPiece comment_piece; + comment_piece.type = CommentPiece::Type::TEXT; + comment_piece.text = std::move(text); + + if(parse_userdata->html_node.empty()) { comment_piece.type = CommentPiece::Type::TEXT; - // Warning: Cast from const char* to char* ... - comment_piece.text = { (char*)inner_text, inner_text_size }; - callback(comment_piece); + } else { + HtmlNode &html_node = parse_userdata->html_node.top(); + switch(html_node.node_type) { + case NodeType::A: { + if(html_node.klass == "quotelink") { + if(string_starts_with(html_node.href, "#p")) { + comment_piece.type = CommentPiece::Type::QUOTELINK; + comment_piece.quote_postnumber = strtoll(html_node.href.c_str() + 2, nullptr, 10); + } else if(string_starts_with(html_node.href, "/")) { + comment_piece.type = CommentPiece::Type::CROSSBOARD_LINK; + } else { + fprintf(stderr, "Unexpected href for quotelink: %s\n", html_node.href.c_str()); + } + } else { + fprintf(stderr, "Unexpected class for a: %s\n", html_node.klass.c_str()); + } + break; + } + case NodeType::SPAN: { + if(html_node.klass == "quote") { + comment_piece.type = html_node.output_count ? CommentPiece::Type::QUOTE : CommentPiece::Type::QUOTE_CONTINUE; + } else if(html_node.klass == "deadlink") { + comment_piece.type = CommentPiece::Type::DEADLINK; + } else { + fprintf(stderr, "Unexpected class for span: %s\n", html_node.klass.c_str()); + } + break; + } + case NodeType::PRE: { + if(html_node.klass == "prettyprint") { + comment_piece.type = CommentPiece::Type::CODEBLOCK; + } else { + fprintf(stderr, "Unexpected class for pre: %s\n", html_node.klass.c_str()); + } + break; + } + } + html_node.output_count++; } - tidyBufFree(&tidy_buffer); - } else { - int res = extract_comment_pieces(doc, child, callback); - if(res != 0) - return res; + + parse_userdata->callback(comment_piece); + break; + } + case HTML_PARSE_JAVASCRIPT_CODE: { + break; } } + return 0; } static void extract_comment_pieces(const char *html_source, size_t size, CommentPieceCallback callback) { - TidyDoc doc = tidyCreate(); - tidyOptSetBool(doc, TidyShowWarnings, no); - tidyOptSetInt(doc, TidyUseCustomTags, 1); - tidyOptSetInt(doc, TidyWrapLen, 0); - if(tidyParseString(doc, html_source) < 0) { - CommentPiece comment_piece; - comment_piece.type = CommentPiece::Type::TEXT; - // Warning: Cast from const char* to char* ... - comment_piece.text = { (char*)html_source, size }; - callback(comment_piece); - } else { - extract_comment_pieces(doc, tidyGetRoot(doc), callback); - } - tidyRelease(doc); + HtmlParseUserdata parse_userdata; + parse_userdata.callback = std::move(callback); + html_parser_parse(html_source, size, html_parse_callback, &parse_userdata); + } + + static std::string html_to_text(const char *html_source, size_t size, std::unordered_map<int64_t, size_t> &comment_by_postno, BodyItems &result_items, size_t body_item_index) { + std::string comment_text; + extract_comment_pieces(html_source, size, + [&comment_text, &comment_by_postno, &result_items, body_item_index](const CommentPiece &cp) { + switch(cp.type) { + case CommentPiece::Type::TEXT: + comment_text += std::move(cp.text); + break; + case CommentPiece::Type::QUOTE: + comment_text += std::move(cp.text); + break; + case CommentPiece::Type::QUOTE_CONTINUE: + comment_text += std::move(cp.text); + break; + case CommentPiece::Type::QUOTELINK: { + comment_text += std::move(cp.text); + auto it = comment_by_postno.find(cp.quote_postnumber); + if(it == comment_by_postno.end()) { + // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available) + comment_text += " (Dead)"; + } else { + result_items[body_item_index]->replies_to.push_back(it->second); + result_items[it->second]->replies.push_back(body_item_index); + } + break; + } + case CommentPiece::Type::DEADLINK: + // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available) + comment_text += std::move(cp.text) + " (Dead)"; + break; + case CommentPiece::Type::CROSSBOARD_LINK: + // TODO: Link this to another thread and allow navigating to it + comment_text += std::move(cp.text) + " (Cross-thread)"; + break; + case CommentPiece::Type::CODEBLOCK: + // TODO: Use a different colored background and use a monospace font + comment_text += std::move(cp.text); + break; + } + }); + return comment_text; } PluginResult FourchanBoardsPage::submit(const std::string &title, const std::string &url, std::vector<Tab> &result_tabs) { @@ -264,68 +325,14 @@ namespace QuickMedia { author_str += " #" + std::to_string(post_num.asInt64()); - std::string comment_text; - extract_comment_pieces(sub_begin, sub_end - sub_begin, - [&comment_text](const CommentPiece &cp) { - switch(cp.type) { - case CommentPiece::Type::TEXT: - comment_text.append(cp.text.data, cp.text.size); - break; - case CommentPiece::Type::QUOTE: - //comment_text += '>'; - //comment_text.append(cp.text.data, cp.text.size); - //comment_text += '\n'; - break; - case CommentPiece::Type::QUOTELINK: { - comment_text.append(cp.text.data, cp.text.size); - break; - } - case CommentPiece::Type::LINE_CONTINUE: { - if(!comment_text.empty() && comment_text.back() == '\n') { - comment_text.pop_back(); - } - break; - } - } - } - ); + std::string comment_text = html_to_text(sub_begin, sub_end - sub_begin, comment_by_postno, result_items, body_item_index); if(!comment_text.empty()) comment_text += '\n'; - extract_comment_pieces(comment_begin, comment_end - comment_begin, - [&comment_text, &comment_by_postno, &result_items, body_item_index](const CommentPiece &cp) { - switch(cp.type) { - case CommentPiece::Type::TEXT: - comment_text.append(cp.text.data, cp.text.size); - break; - case CommentPiece::Type::QUOTE: - //comment_text += '>'; - //comment_text.append(cp.text.data, cp.text.size); - //comment_text += '\n'; - break; - case CommentPiece::Type::QUOTELINK: { - comment_text.append(cp.text.data, cp.text.size); - auto it = comment_by_postno.find(cp.quote_postnumber); - if(it == comment_by_postno.end()) { - // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available) - comment_text += "(dead)"; - } else { - result_items[body_item_index]->replies_to.push_back(it->second); - result_items[it->second]->replies.push_back(body_item_index); - } - break; - } - case CommentPiece::Type::LINE_CONTINUE: { - if(!comment_text.empty() && comment_text.back() == '\n') { - comment_text.pop_back(); - } - break; - } - } - } - ); + + comment_text += html_to_text(comment_begin, comment_end - comment_begin, comment_by_postno, result_items, body_item_index); if(!comment_text.empty() && comment_text.back() == '\n') - comment_text.back() = ' '; - html_unescape_sequences(comment_text); + comment_text.pop_back(); + BodyItem *body_item = result_items[body_item_index].get(); body_item->set_title(std::move(comment_text)); body_item->set_author(std::move(author_str)); @@ -369,6 +376,7 @@ namespace QuickMedia { if(!json_root.isArray()) return PluginResult::ERR; + std::unordered_map<int64_t, size_t> comment_by_postno; for(const Json::Value &page_data : json_root) { if(!page_data.isObject()) continue; @@ -395,61 +403,11 @@ namespace QuickMedia { if(!thread_num.isNumeric()) continue; - std::string title_text; - extract_comment_pieces(sub_begin, sub_end - sub_begin, - [&title_text](const CommentPiece &cp) { - switch(cp.type) { - case CommentPiece::Type::TEXT: - title_text.append(cp.text.data, cp.text.size); - break; - case CommentPiece::Type::QUOTE: - //title_text += '>'; - //title_text.append(cp.text.data, cp.text.size); - //comment_text += '\n'; - break; - case CommentPiece::Type::QUOTELINK: { - title_text.append(cp.text.data, cp.text.size); - break; - } - case CommentPiece::Type::LINE_CONTINUE: { - if(!title_text.empty() && title_text.back() == '\n') { - title_text.pop_back(); - } - break; - } - } - } - ); + std::string title_text = html_to_text(sub_begin, sub_end - sub_begin, comment_by_postno, result_items, 0); if(!title_text.empty() && title_text.back() == '\n') title_text.back() = ' '; - html_unescape_sequences(title_text); - - std::string comment_text; - extract_comment_pieces(comment_begin, comment_end - comment_begin, - [&comment_text](const CommentPiece &cp) { - switch(cp.type) { - case CommentPiece::Type::TEXT: - comment_text.append(cp.text.data, cp.text.size); - break; - case CommentPiece::Type::QUOTE: - //comment_text += '>'; - //comment_text.append(cp.text.data, cp.text.size); - //comment_text += '\n'; - break; - case CommentPiece::Type::QUOTELINK: { - comment_text.append(cp.text.data, cp.text.size); - break; - } - case CommentPiece::Type::LINE_CONTINUE: { - if(!comment_text.empty() && comment_text.back() == '\n') { - comment_text.pop_back(); - } - break; - } - } - } - ); - html_unescape_sequences(comment_text); + + std::string comment_text = html_to_text(comment_begin, comment_end - comment_begin, comment_by_postno, result_items, 0); // TODO: Do the same when wrapping is implemented // TODO: Remove this int num_lines = 0; @@ -462,6 +420,7 @@ namespace QuickMedia { } } } + auto body_item = BodyItem::create(std::move(comment_text)); body_item->set_author(std::move(title_text)); body_item->url = std::to_string(thread_num.asInt64()); diff --git a/src/plugins/MangaGeneric.cpp b/src/plugins/MangaGeneric.cpp index 2d0df6a..a2608ab 100644 --- a/src/plugins/MangaGeneric.cpp +++ b/src/plugins/MangaGeneric.cpp @@ -43,7 +43,11 @@ namespace QuickMedia { const char *field_contains = nullptr; }; - static const char* html_attr_or_inner_text(QuickMediaHtmlNode *node, const char *field_name) { + static bool string_view_contains(const QuickMediaStringView str, const char *sub) { + return memmem(str.data, str.size, sub, strlen(sub)); + } + + static QuickMediaStringView html_attr_or_inner_text(QuickMediaMatchNode *node, const char *field_name) { if(strcmp(field_name, "text") == 0) return quickmedia_html_node_get_text(node); else @@ -66,59 +70,62 @@ namespace QuickMedia { static int html_append_search(QuickMediaHtmlSearch *html_search, const char *html_query, HtmlSearchUserdata *search_userdata) { return quickmedia_html_find_nodes_xpath(html_search, html_query, - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { HtmlSearchUserdata *search_userdata = (HtmlSearchUserdata*)userdata; - const char *field1_value = html_attr_or_inner_text(node, search_userdata->field1); + QuickMediaStringView field1_value = html_attr_or_inner_text(node, search_userdata->field1); if(search_userdata->field2) { - const char *field2_value = html_attr_or_inner_text(node, search_userdata->field2); - if(field1_value && field2_value && (!search_userdata->field2_contains || strstr(field2_value, search_userdata->field2_contains))) { - std::string field1_fixed = strip(field1_value); + QuickMediaStringView field2_value = html_attr_or_inner_text(node, search_userdata->field2); + if(field1_value.data && field2_value.data && (!search_userdata->field2_contains || string_view_contains(field2_value, search_userdata->field2_contains))) { + std::string field1_fixed(field1_value.data, field1_value.size); html_unescape_sequences(field1_fixed); auto item = BodyItem::create(std::move(field1_fixed)); - item->url = strip(field2_value); + item->url = std::string(field2_value.data, field2_value.size); search_userdata->body_items->push_back(std::move(item)); } } else { - if(field1_value) { - std::string field1_fixed = strip(field1_value); + if(field1_value.data) { + std::string field1_fixed(field1_value.data, field1_value.size); html_unescape_sequences(field1_fixed); auto item = BodyItem::create(std::move(field1_fixed)); search_userdata->body_items->push_back(std::move(item)); } } + return 0; }, search_userdata); } static int html_body_item_merge(QuickMediaHtmlSearch *html_search, const char *html_query, HtmlMergeUserdata *merge_userdata) { return quickmedia_html_find_nodes_xpath(html_search, html_query, - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { HtmlMergeUserdata *merge_userdata = (HtmlMergeUserdata*)userdata; BodyItemContext &body_item_image_context = merge_userdata->body_item_image_context; - const char *field_value = html_attr_or_inner_text(node, merge_userdata->field_name); + QuickMediaStringView field_value = html_attr_or_inner_text(node, merge_userdata->field_name); if(body_item_image_context.index < body_item_image_context.body_items->size() - && field_value && (!merge_userdata->field_contains || strstr(field_value, merge_userdata->field_contains))) + && field_value.data && (!merge_userdata->field_contains || string_view_contains(field_value, merge_userdata->field_contains))) { + std::string field_stripped(field_value.data, field_value.size); if(merge_userdata->type == MergeType::THUMBNAIL) { - (*body_item_image_context.body_items)[body_item_image_context.index]->thumbnail_url = strip(field_value); + (*body_item_image_context.body_items)[body_item_image_context.index]->thumbnail_url = std::move(field_stripped); } else if(merge_userdata->type == MergeType::DESCRIPTION) { - std::string field_stripped = strip(field_value); const char *prefix = merge_userdata->desc_prefix ? merge_userdata->desc_prefix : ""; - (*body_item_image_context.body_items)[body_item_image_context.index]->set_description(prefix + field_stripped); + (*body_item_image_context.body_items)[body_item_image_context.index]->set_description(prefix + std::move(field_stripped)); (*body_item_image_context.body_items)[body_item_image_context.index]->set_description_color(get_current_theme().faded_text_color); } body_item_image_context.index++; } + return 0; }, merge_userdata); } static int html_get_page_url(QuickMediaHtmlSearch *html_search, const char *html_query, HtmlPageImageUserdata *page_image_userdata) { return quickmedia_html_find_nodes_xpath(html_search, html_query, - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { HtmlPageImageUserdata *page_image_userdata = (HtmlPageImageUserdata*)userdata; - const char *field1_value = html_attr_or_inner_text(node, page_image_userdata->field_name); - if(page_image_userdata->url->empty() && field1_value && (!page_image_userdata->field_contains || strstr(field1_value, page_image_userdata->field_contains))) { - *page_image_userdata->url = strip(field1_value); + QuickMediaStringView field1_value = html_attr_or_inner_text(node, page_image_userdata->field_name); + if(page_image_userdata->url->empty() && field1_value.data && (!page_image_userdata->field_contains || string_view_contains(field1_value, page_image_userdata->field_contains))) { + *page_image_userdata->url = std::string(field1_value.data, field1_value.size); } + return 0; }, page_image_userdata); } @@ -170,7 +177,7 @@ namespace QuickMedia { return PluginResult::OK; QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; @@ -306,7 +313,7 @@ namespace QuickMedia { return PluginResult::NET_ERR; QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; @@ -338,12 +345,13 @@ namespace QuickMedia { authors_userdata.authors_query = &authors_query; quickmedia_html_find_nodes_xpath(&html_search, authors_query.html_query, - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { HtmlAuthorsUserdata *authors_userdata = (HtmlAuthorsUserdata*)userdata; - const char *title_value = html_attr_or_inner_text(node, authors_userdata->authors_query->title_field); - const char *url_value = html_attr_or_inner_text(node, authors_userdata->authors_query->url_field); - if(title_value && url_value && (!authors_userdata->authors_query->url_contains || strstr(url_value, authors_userdata->authors_query->url_contains))) - (*authors_userdata->creators)[strip(title_value)] = strip(url_value); + QuickMediaStringView title_value = html_attr_or_inner_text(node, authors_userdata->authors_query->title_field); + QuickMediaStringView url_value = html_attr_or_inner_text(node, authors_userdata->authors_query->url_field); + if(title_value.data && url_value.data && (!authors_userdata->authors_query->url_contains || string_view_contains(url_value, authors_userdata->authors_query->url_contains))) + (*authors_userdata->creators)[std::string(title_value.data, title_value.size)] = std::string(url_value.data, url_value.size); + return 0; }, &authors_userdata); } } @@ -471,19 +479,20 @@ namespace QuickMedia { return ImageResult::NET_ERR; QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; result = quickmedia_html_find_nodes_xpath(&html_search, list_page_pagination_query->pages_html_query, - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { HtmlPageCountUserdata *page_count_userdata = (HtmlPageCountUserdata*)userdata; - const char *field1_value = html_attr_or_inner_text(node, page_count_userdata->field_name); - if(field1_value) { - std::string field_value_stripped = strip(field1_value); + QuickMediaStringView field1_value = html_attr_or_inner_text(node, page_count_userdata->field_name); + if(field1_value.data) { + std::string field_value_stripped(field1_value.data, field1_value.size); if(is_number(field_value_stripped.c_str())) page_count_userdata->num_pages = strtol(field_value_stripped.c_str(), nullptr, 10); } + return 0; }, &page_count_userdata); if(result != 0 || page_count_userdata.num_pages == 0) { @@ -583,7 +592,7 @@ namespace QuickMedia { return ImageResult::ERR; QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; @@ -642,17 +651,18 @@ namespace QuickMedia { list_page_images_userdata.field_contains = list_page_images_query->field_contains; QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; result = quickmedia_html_find_nodes_xpath(&html_search, list_page_images_query->html_query, - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { HtmlListPageImagesUserdata *list_page_images_userdata = (HtmlListPageImagesUserdata*)userdata; - const char *field1_value = html_attr_or_inner_text(node, list_page_images_userdata->field_name); - if(field1_value && (!list_page_images_userdata->field_contains || strstr(field1_value, list_page_images_userdata->field_contains))) { - list_page_images_userdata->urls->push_back(strip(field1_value)); + QuickMediaStringView field1_value = html_attr_or_inner_text(node, list_page_images_userdata->field_name); + if(field1_value.data && (!list_page_images_userdata->field_contains || string_view_contains(field1_value, list_page_images_userdata->field_contains))) { + list_page_images_userdata->urls->push_back(std::string(field1_value.data, field1_value.size)); } + return 0; }, &list_page_images_userdata); if(result == 0 && !chapter_image_urls.empty() && list_page_images_query->post_handler) diff --git a/src/plugins/Manganelo.cpp b/src/plugins/Manganelo.cpp index 094d096..e0517dd 100644 --- a/src/plugins/Manganelo.cpp +++ b/src/plugins/Manganelo.cpp @@ -1,11 +1,14 @@ #include "../../plugins/Manganelo.hpp" #include "../../include/Notification.hpp" -#include "../../include/StringUtils.hpp" #include "../../include/NetUtils.hpp" #include "../../include/Theme.hpp" #include <quickmedia/HtmlSearch.h> namespace QuickMedia { + static bool string_view_contains(const QuickMediaStringView str, const char *sub) { + return memmem(str.data, str.size, sub, strlen(sub)); + } + // Returns true if modified static bool remove_html_span(std::string &str) { size_t open_tag_start = str.find("<span"); @@ -35,20 +38,21 @@ namespace QuickMedia { return PluginResult::NET_ERR; QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; result = quickmedia_html_find_nodes_xpath(&html_search, "//ul[class='row-content-chapter']//a", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { auto *item_data = (BodyItems*)userdata; - const char *href = quickmedia_html_node_get_attribute_value(node, "href"); - const char *text = quickmedia_html_node_get_text(node); - if(href && text) { - auto item = BodyItem::create(strip(text)); - item->url = strip(href); + QuickMediaStringView href = quickmedia_html_node_get_attribute_value(node, "href"); + QuickMediaStringView text = quickmedia_html_node_get_text(node); + if(href.data && text.data) { + auto item = BodyItem::create(std::string(text.data, text.size)); + item->url.assign(href.data, href.size); item_data->push_back(std::move(item)); } + return 0; }, &chapters_items); BodyItemContext body_item_context; @@ -56,29 +60,31 @@ namespace QuickMedia { body_item_context.index = 0; quickmedia_html_find_nodes_xpath(&html_search, "//ul[class='row-content-chapter']//span", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { auto *item_data = (BodyItemContext*)userdata; - const char *class_attr = quickmedia_html_node_get_attribute_value(node, "class"); - const char *text = quickmedia_html_node_get_text(node); - if(text && class_attr && strstr(class_attr, "chapter-time") && item_data->index < item_data->body_items->size()) { - std::string uploaded_date = strip(text); - (*item_data->body_items)[item_data->index]->set_description("Uploaded: " + uploaded_date); + QuickMediaStringView class_attr = quickmedia_html_node_get_attribute_value(node, "class"); + QuickMediaStringView text = quickmedia_html_node_get_text(node); + if(text.data && class_attr.data && string_view_contains(class_attr, "chapter-time") && item_data->index < item_data->body_items->size()) { + std::string uploaded_date(text.data, text.size); + (*item_data->body_items)[item_data->index]->set_description("Uploaded: " + std::move(uploaded_date)); (*item_data->body_items)[item_data->index]->set_description_color(get_current_theme().faded_text_color); item_data->index++; } + return 0; }, &body_item_context); quickmedia_html_find_nodes_xpath(&html_search, "//a[class='a-h']", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { std::vector<Creator> *creators = (std::vector<Creator>*)userdata; - const char *href = quickmedia_html_node_get_attribute_value(node, "href"); - const char *text = quickmedia_html_node_get_text(node); - if(href && text && strstr(href, "/author/story/")) { + QuickMediaStringView href = quickmedia_html_node_get_attribute_value(node, "href"); + QuickMediaStringView text = quickmedia_html_node_get_text(node); + if(href.data && text.data && string_view_contains(href, "/author/story/")) { Creator creator; - creator.name = strip(text); - creator.url = strip(href); + creator.name.assign(text.data, text.size); + creator.url.assign(href.data, href.size); creators->push_back(std::move(creator)); } + return 0; }, &creators); cleanup: @@ -124,7 +130,7 @@ namespace QuickMedia { if(name.isString() && name.asCString()[0] != '\0' && nameunsigned.isString() && nameunsigned.asCString()[0] != '\0') { std::string name_str = name.asString(); while(remove_html_span(name_str)) {} - auto item = BodyItem::create(strip(name_str)); + auto item = BodyItem::create(name_str); item->url = "https://manganelo.com/manga/" + url_param_encode(nameunsigned.asString()); if(lastchapter.isString() && lastchapter.asCString()[0] != '\0') { item->set_description("Latest chapter: " + lastchapter.asString()); @@ -192,20 +198,21 @@ namespace QuickMedia { return PluginResult::NET_ERR; QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; result = quickmedia_html_find_nodes_xpath(&html_search, "//div[class='search-story-item']//a[class='item-img']", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { auto *item_data = (BodyItems*)userdata; - const char *href = quickmedia_html_node_get_attribute_value(node, "href"); - const char *title = quickmedia_html_node_get_attribute_value(node, "title"); - if(href && title && strstr(href, "/manga/")) { - auto body_item = BodyItem::create(strip(title)); - body_item->url = strip(href); + QuickMediaStringView href = quickmedia_html_node_get_attribute_value(node, "href"); + QuickMediaStringView title = quickmedia_html_node_get_attribute_value(node, "title"); + if(href.data && title.data && string_view_contains(href, "/manga/")) { + auto body_item = BodyItem::create(std::string(title.data, title.size)); + body_item->url.assign(href.data, href.size); item_data->push_back(std::move(body_item)); } + return 0; }, &result_items); if(result != 0) @@ -216,13 +223,14 @@ namespace QuickMedia { body_item_image_context.index = 0; result = quickmedia_html_find_nodes_xpath(&html_search, "//div[class='search-story-item']//a[class='item-img']//img", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { auto *item_data = (BodyItemContext*)userdata; - const char *src = quickmedia_html_node_get_attribute_value(node, "src"); - if(src && item_data->index < item_data->body_items->size()) { - (*item_data->body_items)[item_data->index]->thumbnail_url = src; + QuickMediaStringView src = quickmedia_html_node_get_attribute_value(node, "src"); + if(src.data && item_data->index < item_data->body_items->size()) { + (*item_data->body_items)[item_data->index]->thumbnail_url.assign(src.data, src.size); item_data->index++; } + return 0; }, &body_item_image_context); cleanup: @@ -261,18 +269,19 @@ namespace QuickMedia { return ImageResult::NET_ERR; QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; result = quickmedia_html_find_nodes_xpath(&html_search, "//div[class='container-chapter-reader']/img", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { auto *urls = (std::vector<std::string>*)userdata; - const char *src = quickmedia_html_node_get_attribute_value(node, "src"); - if(src) { - std::string image_url = strip(src); + QuickMediaStringView src = quickmedia_html_node_get_attribute_value(node, "src"); + if(src.data) { + std::string image_url(src.data, src.size); urls->push_back(std::move(image_url)); } + return 0; }, &chapter_image_urls); cleanup: diff --git a/src/plugins/MediaGeneric.cpp b/src/plugins/MediaGeneric.cpp index 1f2389a..c829a33 100644 --- a/src/plugins/MediaGeneric.cpp +++ b/src/plugins/MediaGeneric.cpp @@ -3,15 +3,20 @@ #include <quickmedia/HtmlSearch.h> namespace QuickMedia { - using HtmlPathCallback = std::function<void(QuickMediaHtmlNode*)>; + static bool string_view_contains(const QuickMediaStringView str, const char *sub) { + return memmem(str.data, str.size, sub, strlen(sub)); + } + + using HtmlPathCallback = std::function<void(QuickMediaMatchNode*)>; static int quickmedia_html_find_nodes_xpath(QuickMediaHtmlSearch *self, const char *xpath, HtmlPathCallback callback) { - return quickmedia_html_find_nodes_xpath(self, xpath, [](QuickMediaHtmlNode *node, void *userdata) { + return quickmedia_html_find_nodes_xpath(self, xpath, [](QuickMediaMatchNode *node, void *userdata) { HtmlPathCallback *callback = (HtmlPathCallback*)userdata; (*callback)(node); + return 0; }, &callback); } - static const char* html_attr_or_inner_text(QuickMediaHtmlNode *node, const char *field_name) { + static QuickMediaStringView html_attr_or_inner_text(QuickMediaMatchNode *node, const char *field_name) { if(strcmp(field_name, "text") == 0) return quickmedia_html_node_get_text(node); else @@ -32,13 +37,13 @@ namespace QuickMedia { } } - static PluginResult fetch_page_results(const std::string &url, const std::string &website_url, const std::vector<MediaTextQuery> &text_queries, const std::vector<MediaThumbnailQuery> &thumbnail_queries, MediaRelatedCustomHandler *custom_handler, BodyItems &result_items) { + static PluginResult fetch_page_results(const std::string &url, const std::string &website_url, const std::vector<MediaTextQuery> &text_queries, const std::vector<MediaThumbnailQuery> &thumbnail_queries, MediaRelatedCustomHandler *custom_handler, BodyItems &result_items, bool cloudflare_bypass) { std::vector<CommandArg> args; if(!website_url.empty()) args.push_back({ "-H", "referer: " + website_url }); std::string website_data; - if(download_to_string(url, website_data, args, true, true, true) != DownloadResult::OK) + if(download_to_string(url, website_data, args, true, true, cloudflare_bypass) != DownloadResult::OK) return PluginResult::NET_ERR; if(website_data.empty()) @@ -47,7 +52,7 @@ namespace QuickMedia { if(custom_handler && *custom_handler) { std::vector<MediaRelatedItem> media_related_items = (*custom_handler)(website_data); for(MediaRelatedItem &media_related_item : media_related_items) { - auto body_item = BodyItem::create(strip(media_related_item.title)); + auto body_item = BodyItem::create(media_related_item.title); body_item->url = std::move(media_related_item.url); body_item->thumbnail_url = std::move(media_related_item.thumbnail_url); result_items.push_back(std::move(body_item)); @@ -57,7 +62,7 @@ namespace QuickMedia { } QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; @@ -68,14 +73,14 @@ namespace QuickMedia { goto cleanup; } - result = quickmedia_html_find_nodes_xpath(&html_search, text_query.html_query, [&text_query, &result_items](QuickMediaHtmlNode *node) { - const char *title_value = html_attr_or_inner_text(node, text_query.title_field); - const char *url_value = html_attr_or_inner_text(node, text_query.url_field); - if(title_value && url_value && (!text_query.url_contains || strstr(url_value, text_query.url_contains))) { - std::string field1_fixed = strip(title_value); + result = quickmedia_html_find_nodes_xpath(&html_search, text_query.html_query, [&text_query, &result_items](QuickMediaMatchNode *node) { + QuickMediaStringView title_value = html_attr_or_inner_text(node, text_query.title_field); + QuickMediaStringView url_value = html_attr_or_inner_text(node, text_query.url_field); + if(title_value.data && url_value.data && (!text_query.url_contains || string_view_contains(url_value, text_query.url_contains))) { + std::string field1_fixed(title_value.data, title_value.size); html_unescape_sequences(field1_fixed); auto item = BodyItem::create(std::move(field1_fixed)); - item->url = strip(url_value); + item->url.assign(url_value.data, url_value.size); result_items.push_back(std::move(item)); } }); @@ -87,10 +92,10 @@ namespace QuickMedia { assert(thumbnail_query.html_query && thumbnail_query.field_name); if(thumbnail_query.html_query && thumbnail_query.field_name) { size_t index = 0; - result = quickmedia_html_find_nodes_xpath(&html_search, thumbnail_query.html_query, [&thumbnail_query, &result_items, &index](QuickMediaHtmlNode *node) { - const char *field_value = html_attr_or_inner_text(node, thumbnail_query.field_name); - if(index < result_items.size() && field_value && (!thumbnail_query.field_contains || strstr(field_value, thumbnail_query.field_contains))) { - result_items[index]->thumbnail_url = strip(field_value); + result = quickmedia_html_find_nodes_xpath(&html_search, thumbnail_query.html_query, [&thumbnail_query, &result_items, &index](QuickMediaMatchNode *node) { + QuickMediaStringView field_value = html_attr_or_inner_text(node, thumbnail_query.field_name); + if(index < result_items.size() && field_value.data && (!thumbnail_query.field_contains || string_view_contains(field_value, thumbnail_query.field_contains))) { + result_items[index]->thumbnail_url.assign(field_value.data, field_value.size); ++index; } }); @@ -111,8 +116,8 @@ namespace QuickMedia { } } - MediaGenericSearchPage::MediaGenericSearchPage(Program *program, const char *website_url, sf::Vector2i thumbnail_max_size) : - Page(program), website_url(website_url ? website_url : ""), thumbnail_max_size(thumbnail_max_size) + MediaGenericSearchPage::MediaGenericSearchPage(Program *program, const char *website_url, sf::Vector2i thumbnail_max_size, bool cloudflare_bypass) : + Page(program), website_url(website_url ? website_url : ""), thumbnail_max_size(thumbnail_max_size), cloudflare_bypass(cloudflare_bypass) { if(!this->website_url.empty()) { if(this->website_url.back() != '/') @@ -128,7 +133,7 @@ namespace QuickMedia { std::string url = search_query.search_template; string_replace_all(url, "%s", url_param_encode(str)); string_replace_all(url, "%p", std::to_string(search_query.page_start + page)); - return fetch_page_results(url, website_url, text_queries, thumbnail_queries, nullptr, result_items); + return fetch_page_results(url, website_url, text_queries, thumbnail_queries, nullptr, result_items, cloudflare_bypass); } PluginResult MediaGenericSearchPage::submit(const std::string&, const std::string &url, std::vector<Tab> &result_tabs) { @@ -137,7 +142,7 @@ namespace QuickMedia { } PluginResult MediaGenericSearchPage::get_related_media(const std::string &url, BodyItems &result_items) { - return fetch_page_results(url, website_url, related_media_text_queries, related_media_thumbnail_queries, &related_custom_handler, result_items); + return fetch_page_results(url, website_url, related_media_text_queries, related_media_thumbnail_queries, &related_custom_handler, result_items, cloudflare_bypass); } MediaGenericSearchPage& MediaGenericSearchPage::search_handler(const char *search_template, int page_start) { diff --git a/src/plugins/NyaaSi.cpp b/src/plugins/NyaaSi.cpp index 5d9e41b..d4667af 100644 --- a/src/plugins/NyaaSi.cpp +++ b/src/plugins/NyaaSi.cpp @@ -291,17 +291,18 @@ namespace QuickMedia { return PluginResult::NET_ERR; QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; result = quickmedia_html_find_nodes_xpath(&html_search, "//h3[class='panel-title']", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { std::string *title = (std::string*)userdata; - const char *text = quickmedia_html_node_get_text(node); - if(title->empty() && text) { - *title = text; + QuickMediaStringView text = quickmedia_html_node_get_text(node); + if(title->empty() && text.data) { + title->assign(text.data, text.size); } + return 0; }, &title); if(result != 0) @@ -314,16 +315,17 @@ namespace QuickMedia { } result = quickmedia_html_find_nodes_xpath(&html_search, "//div[class='panel-body']//div[class='row']//a", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { ResultItemExtra *item_data = (ResultItemExtra*)userdata; - const char *href = quickmedia_html_node_get_attribute_value(node, "href"); - const char *text = quickmedia_html_node_get_text(node); - if(item_data->result_items->empty() && href && text && strncmp(href, "/user/", 6) == 0) { + QuickMediaStringView href = quickmedia_html_node_get_attribute_value(node, "href"); + QuickMediaStringView text = quickmedia_html_node_get_text(node); + if(item_data->result_items->empty() && href.data && text.data && href.size >= 6 && memcmp(href.data, "/user/", 6) == 0) { auto body_item = BodyItem::create(""); - body_item->set_description("Submitter: " + strip(text)); - body_item->url = "https://" + *item_data->domain + "/" + std::string(href); + body_item->set_description("Submitter: " + std::string(text.data, text.size)); + body_item->url = "https://" + *item_data->domain + "/" + std::string(href.data, href.size); item_data->result_items->push_back(std::move(body_item)); } + return 0; }, &result_item_extra); if(result != 0) @@ -335,17 +337,18 @@ namespace QuickMedia { result_items.push_back(std::move(body_item)); } - result_items.front()->set_title(strip(title)); + result_items.front()->set_title(title); result = quickmedia_html_find_nodes_xpath(&html_search, "//div[id='torrent-description']", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { std::string *description = (std::string*)userdata; - const char *text = quickmedia_html_node_get_text(node); - if(description->empty() && text) { - std::string desc = strip(text); + QuickMediaStringView text = quickmedia_html_node_get_text(node); + if(description->empty() && text.data) { + std::string desc(text.data, text.size); html_unescape_sequences(desc); *description = std::move(desc); } + return 0; }, &description); if(result != 0) @@ -355,12 +358,13 @@ namespace QuickMedia { result_items.front()->set_description(result_items.front()->get_description() + "\nDescription:\n" + description); result = quickmedia_html_find_nodes_xpath(&html_search, "//div[class='container']//a", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { std::string *magnet_url = (std::string*)userdata; - const char *href = quickmedia_html_node_get_attribute_value(node, "href"); - if(magnet_url->empty() && href && strncmp(href, "magnet:?", 8) == 0) { - *magnet_url = href; + QuickMediaStringView href = quickmedia_html_node_get_attribute_value(node, "href"); + if(magnet_url->empty() && href.data && href.size >= 8 && memcmp(href.data, "magnet:?", 8) == 0) { + magnet_url->assign(href.data, href.size); } + return 0; }, &magnet_url); if(result != 0) @@ -377,15 +381,16 @@ namespace QuickMedia { comments_start_index = result_items.size(); result = quickmedia_html_find_nodes_xpath(&html_search, "//div[id='comments']//a", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { auto *item_data = (BodyItems*)userdata; - const char *href = quickmedia_html_node_get_attribute_value(node, "href"); - const char *text = quickmedia_html_node_get_text(node); - if(href && text && strncmp(href, "/user/", 6) == 0) { - auto body_item = BodyItem::create(strip(text)); + QuickMediaStringView href = quickmedia_html_node_get_attribute_value(node, "href"); + QuickMediaStringView text = quickmedia_html_node_get_text(node); + if(href.data && text.data && href.size >= 6 && memcmp(href.data, "/user/", 6) == 0) { + auto body_item = BodyItem::create(std::string(text.data, text.size)); //body_item->url = "https://nyaa.si/" + std::string(href); item_data->push_back(std::move(body_item)); } + return 0; }, &result_items); if(result != 0 || result_items.size() == comments_start_index) @@ -396,14 +401,15 @@ namespace QuickMedia { body_item_image_context.index = comments_start_index; result = quickmedia_html_find_nodes_xpath(&html_search, "//div[id='comments']//img[class='avatar']", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { auto *item_data = (BodyItemContext*)userdata; - const char *src = quickmedia_html_node_get_attribute_value(node, "src"); - if(src && item_data->index < item_data->body_items->size()) { - (*item_data->body_items)[item_data->index]->thumbnail_url = src; + QuickMediaStringView src = quickmedia_html_node_get_attribute_value(node, "src"); + if(src.data && item_data->index < item_data->body_items->size()) { + (*item_data->body_items)[item_data->index]->thumbnail_url.assign(src.data, src.size); (*item_data->body_items)[item_data->index]->thumbnail_size = sf::Vector2i(120, 120); item_data->index++; } + return 0; }, &body_item_image_context); if(result != 0) @@ -412,15 +418,16 @@ namespace QuickMedia { body_item_image_context.index = comments_start_index; result = quickmedia_html_find_nodes_xpath(&html_search, "//div[id='comments']//div[class='comment-content']", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { auto *item_data = (BodyItemContext*)userdata; - const char *text = quickmedia_html_node_get_text(node); - if(text && item_data->index < item_data->body_items->size()) { - std::string desc = strip(text); + QuickMediaStringView text = quickmedia_html_node_get_text(node); + if(text.data && item_data->index < item_data->body_items->size()) { + std::string desc(text.data, text.size); html_unescape_sequences(desc); (*item_data->body_items)[item_data->index]->set_description(std::move(desc)); item_data->index++; } + return 0; }, &body_item_image_context); cleanup: diff --git a/src/plugins/Saucenao.cpp b/src/plugins/Saucenao.cpp index 1278bed..e8d8357 100644 --- a/src/plugins/Saucenao.cpp +++ b/src/plugins/Saucenao.cpp @@ -1,5 +1,4 @@ #include "../../plugins/Saucenao.hpp" -#include "../../include/StringUtils.hpp" #include <quickmedia/HtmlSearch.h> namespace QuickMedia { @@ -19,22 +18,23 @@ namespace QuickMedia { if(download_result != DownloadResult::OK) return download_result_to_plugin_result(download_result); QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; result = quickmedia_html_find_nodes_xpath(&html_search, "//td[class='resulttablecontent']", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { BodyItems *item_data = (BodyItems*)userdata; - const char *text = quickmedia_html_node_get_text(node); - if(text) { - std::string title = text; + QuickMediaStringView text = quickmedia_html_node_get_text(node); + if(text.data) { + std::string title(text.data, text.size); size_t p_index = title.find("%"); if(p_index != std::string::npos) title = title.erase(0, p_index + 1); - auto item = BodyItem::create(strip(title)); + auto item = BodyItem::create(title); item_data->push_back(std::move(item)); } + return 0; }, &result_items); BodyItemContext body_item_context; @@ -42,16 +42,17 @@ namespace QuickMedia { body_item_context.index = 0; quickmedia_html_find_nodes_xpath(&html_search, "//td[class='resulttableimage']//img", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { BodyItemContext *item_data = (BodyItemContext*)userdata; - const char *src = quickmedia_html_node_get_attribute_value(node, "src"); - const char *data_src = quickmedia_html_node_get_attribute_value(node, "data-src"); - const char *image_url = data_src ? data_src : src; - if(image_url && item_data->index < item_data->body_items->size()) { - (*item_data->body_items)[item_data->index]->thumbnail_url = strip(image_url); + QuickMediaStringView src = quickmedia_html_node_get_attribute_value(node, "src"); + QuickMediaStringView data_src = quickmedia_html_node_get_attribute_value(node, "data-src"); + QuickMediaStringView image_url = data_src.data ? data_src : src; + if(image_url.data && item_data->index < item_data->body_items->size()) { + (*item_data->body_items)[item_data->index]->thumbnail_url.assign(image_url.data, image_url.size); (*item_data->body_items)[item_data->index]->thumbnail_size = sf::Vector2i(150, 147); item_data->index++; } + return 0; }, &body_item_context); cleanup: diff --git a/src/plugins/Soundcloud.cpp b/src/plugins/Soundcloud.cpp index abc8c18..7079e46 100644 --- a/src/plugins/Soundcloud.cpp +++ b/src/plugins/Soundcloud.cpp @@ -345,16 +345,17 @@ namespace QuickMedia { if(client_id.empty()) { std::vector<std::string> script_sources; QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); + int result = quickmedia_html_search_init(&html_search, website_data.c_str(), website_data.size()); if(result != 0) goto cleanup; result = quickmedia_html_find_nodes_xpath(&html_search, "//script", - [](QuickMediaHtmlNode *node, void *userdata) { + [](QuickMediaMatchNode *node, void *userdata) { std::vector<std::string> *script_sources = (std::vector<std::string>*)userdata; - const char *src = quickmedia_html_node_get_attribute_value(node, "src"); - if(src) - script_sources->push_back(strip(src)); + QuickMediaStringView src = quickmedia_html_node_get_attribute_value(node, "src"); + if(src.data) + script_sources->push_back(std::string(src.data, src.size)); + return 0; }, &script_sources); cleanup: diff --git a/src/plugins/Youtube.cpp b/src/plugins/Youtube.cpp index 24fd448..13655c7 100644 --- a/src/plugins/Youtube.cpp +++ b/src/plugins/Youtube.cpp @@ -1610,25 +1610,25 @@ R"END( if(!subscription_data.inside_entry && subscription_data.author.empty()) { if(parse_type == HTML_PARSE_TAG_START && string_view_equals(&html_parser->tag_name, "title")) { subscription_data.inside_title = true; - return; + return 0; } else if(parse_type == HTML_PARSE_TAG_END && string_view_equals(&html_parser->tag_name, "title")) { subscription_data.inside_title = false; subscription_data.author.assign(html_parser->text_stripped.data, html_parser->text_stripped.size); - return; + return 0; } } if(parse_type == HTML_PARSE_TAG_START && string_view_equals(&html_parser->tag_name, "entry")) { subscription_data.subscription_entry.push_back({}); subscription_data.inside_entry = true; - return; + return 0; } else if(parse_type == HTML_PARSE_TAG_END && string_view_equals(&html_parser->tag_name, "entry")) { subscription_data.inside_entry = false; - return; + return 0; } if(!subscription_data.inside_entry) - return; + return 0; if(string_view_equals(&html_parser->tag_name, "title") && parse_type == HTML_PARSE_TAG_END) { subscription_data.subscription_entry.back().title.assign(html_parser->text_stripped.data, html_parser->text_stripped.size); @@ -1638,6 +1638,8 @@ R"END( std::string published_str(html_parser->text_stripped.data, html_parser->text_stripped.size); subscription_data.subscription_entry.back().published = iso_utc_to_unix_time(published_str.c_str()); } + + return 0; }, &subscription_data); std::vector<YoutubeSubscriptionTaskResult> results; |