From 611d22bf269672ba56f98e12eb6b2a40efdaa5b9 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Sat, 3 Jul 2021 16:23:36 +0200 Subject: Remove dependency on tidy, fix ph, support all 4chan markup Go back to previous page when failing to fetch number of pages --- src/plugins/Fourchan.cpp | 389 +++++++++++++++++++++-------------------------- 1 file changed, 174 insertions(+), 215 deletions(-) (limited to 'src/plugins/Fourchan.cpp') diff --git a/src/plugins/Fourchan.cpp b/src/plugins/Fourchan.cpp index 52024e1..4b2ca61 100644 --- a/src/plugins/Fourchan.cpp +++ b/src/plugins/Fourchan.cpp @@ -3,10 +3,9 @@ #include "../../include/Storage.hpp" #include "../../include/StringUtils.hpp" #include "../../include/NetUtils.hpp" +#include #include #include -#include -#include // API documentation: https://github.com/4chan/4chan-API @@ -37,133 +36,195 @@ namespace QuickMedia { struct CommentPiece { enum class Type { TEXT, - QUOTE, // > - QUOTELINK, // >>POSTNO, - LINE_CONTINUE + QUOTE, // >, Set for span + QUOTE_CONTINUE, // Set for span + QUOTELINK, // >>POSTNO, Set for a + DEADLINK, // Set for span + CROSSBOARD_LINK, // Set for a + CODEBLOCK // Set for pre }; - DataView text; // Set when type is TEXT, QUOTE or QUOTELINK + std::string text; int64_t quote_postnumber = 0; // Set when type is QUOTELINK Type type; }; - static TidyAttr get_attribute_by_name(TidyNode node, const char *name) { - for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) { - const char *attr_name = tidyAttrName(attr); - if(attr_name && strcmp(name, attr_name) == 0) - return attr; - } - return nullptr; - } - - static const char* get_attribute_value(TidyNode node, const char *name) { - TidyAttr attr = get_attribute_by_name(node, name); - if(attr) - return tidyAttrValue(attr); - return nullptr; - } - - static void lstrip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) { - size_t i = 0; - while(i < size && str[i] == '\n') { - ++i; - } - *output_str = str + i; - *output_size = size - i; - } + enum class NodeType { + A, + SPAN, + PRE + }; - static void rstrip_newline(const char *str, size_t size, size_t *output_size) { - ssize_t i = size - 1; - while(i >= 0 && str[i] == '\n') { - --i; + // Returns -1 if no match + static NodeType tag_name_to_node_type(HtmlStringView str) { + if(str.size == 1 && str.data[0] == 'a') { + return NodeType::A; + } else if(str.size == 4 && memcmp(str.data, "span", 4) == 0) { + return NodeType::SPAN; + } else if(str.size == 3 && memcmp(str.data, "pre", 3) == 0) { + return NodeType::PRE; + } else { + return (NodeType)-1; } - *output_size = i + 1; } - static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) { - lstrip_newline(str, size, output_str, output_size); - rstrip_newline(*output_str, *output_size, output_size); - } + struct HtmlNode { + NodeType node_type; + std::string klass; + std::string href; + int output_count = 0; + }; using CommentPieceCallback = std::function; - static int extract_comment_pieces(TidyDoc doc, TidyNode node, CommentPieceCallback &callback) { - for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { - const char *node_name = tidyNodeGetName(child); - if(node_name) { - if(strcmp(node_name, "br") == 0) { + struct HtmlParseUserdata { + CommentPieceCallback callback; + std::stack html_node; + }; + + static int html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) { + HtmlParseUserdata *parse_userdata = (HtmlParseUserdata*)userdata; + + switch(parse_type) { + case HTML_PARSE_TAG_START: { + if(html_parser->tag_name.size == 2 && memcmp(html_parser->tag_name.data, "br", 2) == 0) { CommentPiece comment_piece; comment_piece.type = CommentPiece::Type::TEXT; - // Warning: Cast from const char* to char* ... - comment_piece.text = { (char*)"\n", 1 }; - callback(comment_piece); - /*} else if(strcmp(node_name, "span") == 0) { - const char *span_class = get_attribute_value(child, "class"); - //fprintf(stderr, "span class: %s\n", span_class); - if(span_class && strcmp(span_class, "quote") == 0) { - CommentPiece comment_piece; - comment_piece.type = CommentPiece::Type::QUOTE; - // Warning: Cast from const char* to char* ... - comment_piece.text = { (char*)"\n", 1 }; - callback(comment_piece); - }*/ + comment_piece.text = "\n"; + parse_userdata->callback(comment_piece); } - } - if(tidyNodeGetType(child) == TidyNode_Text) { - TidyBuffer tidy_buffer; - tidyBufInit(&tidy_buffer); - if(tidyNodeGetText(doc, child, &tidy_buffer)) { - const char *inner_text = (const char*)tidy_buffer.bp; - size_t inner_text_size = tidy_buffer.size; - strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size); - - const char *node_name = tidyNodeGetName(node); - if(node_name && strcmp(node_name, "a") == 0) { - const char *a_class = get_attribute_value(node, "class"); - const char *a_href = get_attribute_value(node, "href"); - if(a_class && a_href && strcmp(a_class, "quotelink") == 0 && strncmp(a_href, "#p", 2) == 0) { - CommentPiece comment_piece; - comment_piece.type = CommentPiece::Type::QUOTELINK; - comment_piece.quote_postnumber = strtoll(a_href + 2, nullptr, 10); - // Warning: Cast from const char* to char* ... - comment_piece.text = { (char*)inner_text, inner_text_size }; - callback(comment_piece); - tidyBufFree(&tidy_buffer); - continue; - } + const NodeType node_type = tag_name_to_node_type(html_parser->tag_name); + if(node_type != (NodeType)-1) + parse_userdata->html_node.push({ node_type, "", "", 0 }); + break; + } + case HTML_PARSE_TAG_END: { + if(!parse_userdata->html_node.empty()) { + const NodeType node_type = tag_name_to_node_type(html_parser->tag_name); + if(node_type != (NodeType)-1) + parse_userdata->html_node.pop(); + } + break; + } + case HTML_PARSE_ATTRIBUTE: { + if(!parse_userdata->html_node.empty()) { + HtmlNode &html_node = parse_userdata->html_node.top(); + if(html_parser->attribute_key.size == 5 && memcmp(html_parser->attribute_key.data, "class", 5) == 0) { + html_node.klass.assign(html_parser->attribute_value.data, html_parser->attribute_value.size); + } else if(html_parser->attribute_key.size == 4 && memcmp(html_parser->attribute_key.data, "href", 4) == 0) { + html_node.href.assign(html_parser->attribute_value.data, html_parser->attribute_value.size); } + } + break; + } + case HTML_PARSE_TEXT: { + std::string text(html_parser->text.data, html_parser->text.size); + html_unescape_sequences(text); - CommentPiece comment_piece; + CommentPiece comment_piece; + comment_piece.type = CommentPiece::Type::TEXT; + comment_piece.text = std::move(text); + + if(parse_userdata->html_node.empty()) { comment_piece.type = CommentPiece::Type::TEXT; - // Warning: Cast from const char* to char* ... - comment_piece.text = { (char*)inner_text, inner_text_size }; - callback(comment_piece); + } else { + HtmlNode &html_node = parse_userdata->html_node.top(); + switch(html_node.node_type) { + case NodeType::A: { + if(html_node.klass == "quotelink") { + if(string_starts_with(html_node.href, "#p")) { + comment_piece.type = CommentPiece::Type::QUOTELINK; + comment_piece.quote_postnumber = strtoll(html_node.href.c_str() + 2, nullptr, 10); + } else if(string_starts_with(html_node.href, "/")) { + comment_piece.type = CommentPiece::Type::CROSSBOARD_LINK; + } else { + fprintf(stderr, "Unexpected href for quotelink: %s\n", html_node.href.c_str()); + } + } else { + fprintf(stderr, "Unexpected class for a: %s\n", html_node.klass.c_str()); + } + break; + } + case NodeType::SPAN: { + if(html_node.klass == "quote") { + comment_piece.type = html_node.output_count ? CommentPiece::Type::QUOTE : CommentPiece::Type::QUOTE_CONTINUE; + } else if(html_node.klass == "deadlink") { + comment_piece.type = CommentPiece::Type::DEADLINK; + } else { + fprintf(stderr, "Unexpected class for span: %s\n", html_node.klass.c_str()); + } + break; + } + case NodeType::PRE: { + if(html_node.klass == "prettyprint") { + comment_piece.type = CommentPiece::Type::CODEBLOCK; + } else { + fprintf(stderr, "Unexpected class for pre: %s\n", html_node.klass.c_str()); + } + break; + } + } + html_node.output_count++; } - tidyBufFree(&tidy_buffer); - } else { - int res = extract_comment_pieces(doc, child, callback); - if(res != 0) - return res; + + parse_userdata->callback(comment_piece); + break; + } + case HTML_PARSE_JAVASCRIPT_CODE: { + break; } } + return 0; } static void extract_comment_pieces(const char *html_source, size_t size, CommentPieceCallback callback) { - TidyDoc doc = tidyCreate(); - tidyOptSetBool(doc, TidyShowWarnings, no); - tidyOptSetInt(doc, TidyUseCustomTags, 1); - tidyOptSetInt(doc, TidyWrapLen, 0); - if(tidyParseString(doc, html_source) < 0) { - CommentPiece comment_piece; - comment_piece.type = CommentPiece::Type::TEXT; - // Warning: Cast from const char* to char* ... - comment_piece.text = { (char*)html_source, size }; - callback(comment_piece); - } else { - extract_comment_pieces(doc, tidyGetRoot(doc), callback); - } - tidyRelease(doc); + HtmlParseUserdata parse_userdata; + parse_userdata.callback = std::move(callback); + html_parser_parse(html_source, size, html_parse_callback, &parse_userdata); + } + + static std::string html_to_text(const char *html_source, size_t size, std::unordered_map &comment_by_postno, BodyItems &result_items, size_t body_item_index) { + std::string comment_text; + extract_comment_pieces(html_source, size, + [&comment_text, &comment_by_postno, &result_items, body_item_index](const CommentPiece &cp) { + switch(cp.type) { + case CommentPiece::Type::TEXT: + comment_text += std::move(cp.text); + break; + case CommentPiece::Type::QUOTE: + comment_text += std::move(cp.text); + break; + case CommentPiece::Type::QUOTE_CONTINUE: + comment_text += std::move(cp.text); + break; + case CommentPiece::Type::QUOTELINK: { + comment_text += std::move(cp.text); + auto it = comment_by_postno.find(cp.quote_postnumber); + if(it == comment_by_postno.end()) { + // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available) + comment_text += " (Dead)"; + } else { + result_items[body_item_index]->replies_to.push_back(it->second); + result_items[it->second]->replies.push_back(body_item_index); + } + break; + } + case CommentPiece::Type::DEADLINK: + // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available) + comment_text += std::move(cp.text) + " (Dead)"; + break; + case CommentPiece::Type::CROSSBOARD_LINK: + // TODO: Link this to another thread and allow navigating to it + comment_text += std::move(cp.text) + " (Cross-thread)"; + break; + case CommentPiece::Type::CODEBLOCK: + // TODO: Use a different colored background and use a monospace font + comment_text += std::move(cp.text); + break; + } + }); + return comment_text; } PluginResult FourchanBoardsPage::submit(const std::string &title, const std::string &url, std::vector &result_tabs) { @@ -264,68 +325,14 @@ namespace QuickMedia { author_str += " #" + std::to_string(post_num.asInt64()); - std::string comment_text; - extract_comment_pieces(sub_begin, sub_end - sub_begin, - [&comment_text](const CommentPiece &cp) { - switch(cp.type) { - case CommentPiece::Type::TEXT: - comment_text.append(cp.text.data, cp.text.size); - break; - case CommentPiece::Type::QUOTE: - //comment_text += '>'; - //comment_text.append(cp.text.data, cp.text.size); - //comment_text += '\n'; - break; - case CommentPiece::Type::QUOTELINK: { - comment_text.append(cp.text.data, cp.text.size); - break; - } - case CommentPiece::Type::LINE_CONTINUE: { - if(!comment_text.empty() && comment_text.back() == '\n') { - comment_text.pop_back(); - } - break; - } - } - } - ); + std::string comment_text = html_to_text(sub_begin, sub_end - sub_begin, comment_by_postno, result_items, body_item_index); if(!comment_text.empty()) comment_text += '\n'; - extract_comment_pieces(comment_begin, comment_end - comment_begin, - [&comment_text, &comment_by_postno, &result_items, body_item_index](const CommentPiece &cp) { - switch(cp.type) { - case CommentPiece::Type::TEXT: - comment_text.append(cp.text.data, cp.text.size); - break; - case CommentPiece::Type::QUOTE: - //comment_text += '>'; - //comment_text.append(cp.text.data, cp.text.size); - //comment_text += '\n'; - break; - case CommentPiece::Type::QUOTELINK: { - comment_text.append(cp.text.data, cp.text.size); - auto it = comment_by_postno.find(cp.quote_postnumber); - if(it == comment_by_postno.end()) { - // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available) - comment_text += "(dead)"; - } else { - result_items[body_item_index]->replies_to.push_back(it->second); - result_items[it->second]->replies.push_back(body_item_index); - } - break; - } - case CommentPiece::Type::LINE_CONTINUE: { - if(!comment_text.empty() && comment_text.back() == '\n') { - comment_text.pop_back(); - } - break; - } - } - } - ); + + comment_text += html_to_text(comment_begin, comment_end - comment_begin, comment_by_postno, result_items, body_item_index); if(!comment_text.empty() && comment_text.back() == '\n') - comment_text.back() = ' '; - html_unescape_sequences(comment_text); + comment_text.pop_back(); + BodyItem *body_item = result_items[body_item_index].get(); body_item->set_title(std::move(comment_text)); body_item->set_author(std::move(author_str)); @@ -369,6 +376,7 @@ namespace QuickMedia { if(!json_root.isArray()) return PluginResult::ERR; + std::unordered_map comment_by_postno; for(const Json::Value &page_data : json_root) { if(!page_data.isObject()) continue; @@ -395,61 +403,11 @@ namespace QuickMedia { if(!thread_num.isNumeric()) continue; - std::string title_text; - extract_comment_pieces(sub_begin, sub_end - sub_begin, - [&title_text](const CommentPiece &cp) { - switch(cp.type) { - case CommentPiece::Type::TEXT: - title_text.append(cp.text.data, cp.text.size); - break; - case CommentPiece::Type::QUOTE: - //title_text += '>'; - //title_text.append(cp.text.data, cp.text.size); - //comment_text += '\n'; - break; - case CommentPiece::Type::QUOTELINK: { - title_text.append(cp.text.data, cp.text.size); - break; - } - case CommentPiece::Type::LINE_CONTINUE: { - if(!title_text.empty() && title_text.back() == '\n') { - title_text.pop_back(); - } - break; - } - } - } - ); + std::string title_text = html_to_text(sub_begin, sub_end - sub_begin, comment_by_postno, result_items, 0); if(!title_text.empty() && title_text.back() == '\n') title_text.back() = ' '; - html_unescape_sequences(title_text); - - std::string comment_text; - extract_comment_pieces(comment_begin, comment_end - comment_begin, - [&comment_text](const CommentPiece &cp) { - switch(cp.type) { - case CommentPiece::Type::TEXT: - comment_text.append(cp.text.data, cp.text.size); - break; - case CommentPiece::Type::QUOTE: - //comment_text += '>'; - //comment_text.append(cp.text.data, cp.text.size); - //comment_text += '\n'; - break; - case CommentPiece::Type::QUOTELINK: { - comment_text.append(cp.text.data, cp.text.size); - break; - } - case CommentPiece::Type::LINE_CONTINUE: { - if(!comment_text.empty() && comment_text.back() == '\n') { - comment_text.pop_back(); - } - break; - } - } - } - ); - html_unescape_sequences(comment_text); + + std::string comment_text = html_to_text(comment_begin, comment_end - comment_begin, comment_by_postno, result_items, 0); // TODO: Do the same when wrapping is implemented // TODO: Remove this int num_lines = 0; @@ -462,6 +420,7 @@ namespace QuickMedia { } } } + auto body_item = BodyItem::create(std::move(comment_text)); body_item->set_author(std::move(title_text)); body_item->url = std::to_string(thread_num.asInt64()); -- cgit v1.2.3