Remove dependency on tidy, fix ph, support all 4chan markup

Go back to previous page when failing to fetch number of pages
author: dec05eba <dec05eba@protonmail.com> 2021-07-03 16:23:36 +0200
committer: dec05eba <dec05eba@protonmail.com> 2021-07-03 18:34:37 +0200
commit: 611d22bf269672ba56f98e12eb6b2a40efdaa5b9 (patch)
tree: 5dfd3e98fd08fa7cb6cb82c565b538cc891b6b98 /src/plugins/Fourchan.cpp
parent: 496f71413df2468a9d3329355ffef08280219808 (diff)
1 files changed, 174 insertions, 215 deletions
diff --git a/src/plugins/Fourchan.cpp b/src/plugins/Fourchan.cpp
index 52024e1..4b2ca61 100644
--- a/src/plugins/Fourchan.cpp
+++ b/src/plugins/Fourchan.cpp
@@ -3,10 +3,9 @@
 #include "../../include/Storage.hpp"
 #include "../../include/StringUtils.hpp"
 #include "../../include/NetUtils.hpp"
+#include <HtmlParser.h>
 #include <json/reader.h>
 #include <string.h>
-#include <tidy.h>
-#include <tidybuffio.h>
 
 // API documentation: https://github.com/4chan/4chan-API
 
@@ -37,133 +36,195 @@ namespace QuickMedia {
     struct CommentPiece {
         enum class Type {
             TEXT,
-            QUOTE, // >
-            QUOTELINK, // >>POSTNO,
-            LINE_CONTINUE
+            QUOTE, // >, Set for span with class "quote"
+            QUOTE_CONTINUE, // Set for span with class "quote"
+            QUOTELINK, // >>POSTNO, Set for a with class "quotelink" and a href that starts with "#p"
+            DEADLINK, // Set for span with class "deadlink"
+            CROSSBOARD_LINK, // Set for a with class "quotelink" and a href that starts with "/"
+            CODEBLOCK // Set for pre with class "prettyprint"
         };
 
-        DataView text; // Set when type is TEXT, QUOTE or QUOTELINK
+        std::string text; // Set for all types
         int64_t quote_postnumber = 0; // Set when type is QUOTELINK
         Type type;
     };
 
-    static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
-        for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
-            const char *attr_name = tidyAttrName(attr);
-            if(attr_name && strcmp(name, attr_name) == 0)
-                return attr;
-        }
-        return nullptr;
-    }
-
-    static const char* get_attribute_value(TidyNode node, const char *name) {
-        TidyAttr attr = get_attribute_by_name(node, name);
-        if(attr)
-            return tidyAttrValue(attr);
-        return nullptr;
-    }
-
-    static void lstrip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
-        size_t i = 0;
-        while(i < size && str[i] == '\n') {
-            ++i;
-        }
-        *output_str = str + i;
-        *output_size = size - i;
-    }
+    enum class NodeType { // Html tags that are tracked while parsing a comment
+        A, // <a>
+        SPAN, // <span>
+        PRE // <pre>
+    };
 
-    static void rstrip_newline(const char *str, size_t size, size_t *output_size) {
-        ssize_t i = size - 1;
-        while(i >= 0 && str[i] == '\n') {
-            --i;
+    // Maps the tag names "a", "span" and "pre" (exact, case-sensitive match) to a NodeType. Returns (NodeType)-1 if no match
+    static NodeType tag_name_to_node_type(HtmlStringView str) {
+        if(str.size == 1 && str.data[0] == 'a') {
+            return NodeType::A;
+        } else if(str.size == 4 && memcmp(str.data, "span", 4) == 0) {
+            return NodeType::SPAN;
+        } else if(str.size == 3 && memcmp(str.data, "pre", 3) == 0) {
+            return NodeType::PRE;
+        } else {
+            return (NodeType)-1;
         }
-        *output_size = i + 1;
     }
 
-    static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
-        lstrip_newline(str, size, output_str, output_size);
-        rstrip_newline(*output_str, *output_size, output_size);
-    }
+    struct HtmlNode { // One open a/span/pre tag. Member order matters, instances are created with aggregate initialization
+        NodeType node_type;
+        std::string klass; // Value of the "class" attribute, empty until the attribute is parsed
+        std::string href; // Value of the "href" attribute, empty until the attribute is parsed
+        int output_count = 0; // Number of text pieces emitted while this node was at the top of the stack
+    };
 
     using CommentPieceCallback = std::function<void(const CommentPiece&)>;
-    static int extract_comment_pieces(TidyDoc doc, TidyNode node, CommentPieceCallback &callback) {
-        for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
-            const char *node_name = tidyNodeGetName(child);
-            if(node_name) {
-                if(strcmp(node_name, "br") == 0) {
+    struct HtmlParseUserdata { // State passed to html_parse_callback through its userdata pointer
+        CommentPieceCallback callback; // Called for every comment piece that is found
+        std::stack<HtmlNode> html_node; // Currently open a/span/pre tags, the innermost one at the top
+    };
+
+    static int html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) {
+        HtmlParseUserdata *parse_userdata = (HtmlParseUserdata*)userdata;
+        // Tracks open a/span/pre tags on a stack and emits one CommentPiece for every <br> and every text node. Always returns 0
+        switch(parse_type) {
+            case HTML_PARSE_TAG_START: {
+                if(html_parser->tag_name.size == 2 && memcmp(html_parser->tag_name.data, "br", 2) == 0) {
                     CommentPiece comment_piece;
                     comment_piece.type = CommentPiece::Type::TEXT;
-                    // Warning: Cast from const char* to char* ...
-                    comment_piece.text = { (char*)"\n", 1 };
-                    callback(comment_piece);
-                /*} else if(strcmp(node_name, "span") == 0) {
-                    const char *span_class = get_attribute_value(child, "class");
-                    //fprintf(stderr, "span class: %s\n", span_class);
-                    if(span_class && strcmp(span_class, "quote") == 0) {
-                        CommentPiece comment_piece;
-                        comment_piece.type = CommentPiece::Type::QUOTE;
-                        // Warning: Cast from const char* to char* ...
-                        comment_piece.text = { (char*)"\n", 1 };
-                        callback(comment_piece);
-                    }*/
+                    comment_piece.text = "\n";
+                    parse_userdata->callback(comment_piece);
                 }
-            }
 
-            if(tidyNodeGetType(child) == TidyNode_Text) {
-                TidyBuffer tidy_buffer;
-                tidyBufInit(&tidy_buffer);
-                if(tidyNodeGetText(doc, child, &tidy_buffer)) {
-                    const char *inner_text = (const char*)tidy_buffer.bp;
-                    size_t inner_text_size = tidy_buffer.size;
-                    strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size);
-
-                    const char *node_name = tidyNodeGetName(node);
-                    if(node_name && strcmp(node_name, "a") == 0) {
-                        const char *a_class = get_attribute_value(node, "class");
-                        const char *a_href = get_attribute_value(node, "href");
-                        if(a_class && a_href && strcmp(a_class, "quotelink") == 0 && strncmp(a_href, "#p", 2) == 0) {
-                            CommentPiece comment_piece;
-                            comment_piece.type = CommentPiece::Type::QUOTELINK;
-                            comment_piece.quote_postnumber = strtoll(a_href + 2, nullptr, 10);
-                            // Warning: Cast from const char* to char* ...
-                            comment_piece.text = { (char*)inner_text, inner_text_size };
-                            callback(comment_piece);
-                            tidyBufFree(&tidy_buffer);
-                            continue;
-                        }
+                const NodeType node_type = tag_name_to_node_type(html_parser->tag_name);
+                if(node_type != (NodeType)-1)
+                    parse_userdata->html_node.push({ node_type, "", "", 0 }); // klass and href are filled in by the HTML_PARSE_ATTRIBUTE case
+                break;
+            }
+            case HTML_PARSE_TAG_END: {
+                if(!parse_userdata->html_node.empty()) {
+                    const NodeType node_type = tag_name_to_node_type(html_parser->tag_name);
+                    if(node_type != (NodeType)-1)
+                        parse_userdata->html_node.pop(); // Pops the innermost tracked tag without checking that it is the same tag type
+                }
+                break;
+            }
+            case HTML_PARSE_ATTRIBUTE: {
+                if(!parse_userdata->html_node.empty()) {
+                    HtmlNode &html_node = parse_userdata->html_node.top();
+                    if(html_parser->attribute_key.size == 5 && memcmp(html_parser->attribute_key.data, "class", 5) == 0) {
+                        html_node.klass.assign(html_parser->attribute_value.data, html_parser->attribute_value.size);
+                    } else if(html_parser->attribute_key.size == 4 && memcmp(html_parser->attribute_key.data, "href", 4) == 0) {
+                        html_node.href.assign(html_parser->attribute_value.data, html_parser->attribute_value.size);
                     }
+                }
+                break;
+            }
+            case HTML_PARSE_TEXT: {
+                std::string text(html_parser->text.data, html_parser->text.size);
+                html_unescape_sequences(text);
 
-                    CommentPiece comment_piece;
+                CommentPiece comment_piece;
+                comment_piece.type = CommentPiece::Type::TEXT; // Stays TEXT if the class/href of the enclosing tag is not recognized
+                comment_piece.text = std::move(text);
+
+                if(parse_userdata->html_node.empty()) { // Text outside of any a/span/pre tag
                     comment_piece.type = CommentPiece::Type::TEXT;
-                    // Warning: Cast from const char* to char* ...
-                    comment_piece.text = { (char*)inner_text, inner_text_size };
-                    callback(comment_piece);
+                } else {
+                    HtmlNode &html_node = parse_userdata->html_node.top();
+                    switch(html_node.node_type) {
+                        case NodeType::A: {
+                            if(html_node.klass == "quotelink") {
+                                if(string_starts_with(html_node.href, "#p")) {
+                                    comment_piece.type = CommentPiece::Type::QUOTELINK;
+                                    comment_piece.quote_postnumber = strtoll(html_node.href.c_str() + 2, nullptr, 10);
+                                } else if(string_starts_with(html_node.href, "/")) {
+                                    comment_piece.type = CommentPiece::Type::CROSSBOARD_LINK;
+                                } else {
+                                    fprintf(stderr, "Unexpected href for quotelink: %s\n", html_node.href.c_str());
+                                }
+                            } else {
+                                fprintf(stderr, "Unexpected class for a: %s\n", html_node.klass.c_str());
+                            }
+                            break;
+                        }
+                        case NodeType::SPAN: {
+                            if(html_node.klass == "quote") {
+                                comment_piece.type = html_node.output_count == 0 ? CommentPiece::Type::QUOTE : CommentPiece::Type::QUOTE_CONTINUE; // The first text piece starts the quote, later pieces continue it
+                            } else if(html_node.klass == "deadlink") {
+                                comment_piece.type = CommentPiece::Type::DEADLINK;
+                            } else {
+                                fprintf(stderr, "Unexpected class for span: %s\n", html_node.klass.c_str());
+                            }
+                            break;
+                        }
+                        case NodeType::PRE: {
+                            if(html_node.klass == "prettyprint") {
+                                comment_piece.type = CommentPiece::Type::CODEBLOCK;
+                            } else {
+                                fprintf(stderr, "Unexpected class for pre: %s\n", html_node.klass.c_str());
+                            }
+                            break;
+                        }
+                    }
+                    html_node.output_count++;
                 }
-                tidyBufFree(&tidy_buffer);
-            } else {
-                int res = extract_comment_pieces(doc, child, callback);
-                if(res != 0)
-                    return res;
+
+                parse_userdata->callback(comment_piece);
+                break;
+            }
+            case HTML_PARSE_JAVASCRIPT_CODE: {
+                break;
             }
         }
+
         return 0;
     }
 
     static void extract_comment_pieces(const char *html_source, size_t size, CommentPieceCallback callback) {
-        TidyDoc doc = tidyCreate();
-        tidyOptSetBool(doc, TidyShowWarnings, no);
-        tidyOptSetInt(doc, TidyUseCustomTags, 1);
-        tidyOptSetInt(doc, TidyWrapLen, 0);
-        if(tidyParseString(doc, html_source) < 0) {
-            CommentPiece comment_piece;
-            comment_piece.type = CommentPiece::Type::TEXT;
-            // Warning: Cast from const char* to char* ...
-            comment_piece.text = { (char*)html_source, size };
-            callback(comment_piece);
-        } else {
-            extract_comment_pieces(doc, tidyGetRoot(doc), callback);
-        }
-        tidyRelease(doc);
+        HtmlParseUserdata parse_userdata; // Carries the callback and the open-tag stack into html_parse_callback
+        parse_userdata.callback = std::move(callback);
+        html_parser_parse(html_source, size, html_parse_callback, &parse_userdata); // NOTE(review): the result of html_parser_parse is not checked here
+    }
+
+    // Converts a piece of 4chan comment html to plain text by appending the text of every comment piece.
+    // A quotelink to a post found in comment_by_postno is registered as a reply between
+    // result_items[body_item_index] and the quoted item. Other quotelinks and deadlinks get " (Dead)"
+    // appended and crossboard links get " (Cross-thread)" appended. Returns the resulting text.
+    static std::string html_to_text(const char *html_source, size_t size, std::unordered_map<int64_t, size_t> &comment_by_postno, BodyItems &result_items, size_t body_item_index) {
+        std::string comment_text;
+        extract_comment_pieces(html_source, size,
+            [&comment_text, &comment_by_postno, &result_items, body_item_index](const CommentPiece &cp) {
+                switch(cp.type) {
+                    case CommentPiece::Type::TEXT:
+                    case CommentPiece::Type::QUOTE:
+                    case CommentPiece::Type::QUOTE_CONTINUE:
+                        comment_text += cp.text; // cp is const, the text can only be copied
+                        break;
+                    case CommentPiece::Type::QUOTELINK: {
+                        comment_text += cp.text;
+                        auto it = comment_by_postno.find(cp.quote_postnumber);
+                        if(it == comment_by_postno.end()) {
+                            // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available)
+                            comment_text += " (Dead)";
+                        } else {
+                            result_items[body_item_index]->replies_to.push_back(it->second);
+                            result_items[it->second]->replies.push_back(body_item_index);
+                        }
+                        break;
+                    }
+                    case CommentPiece::Type::DEADLINK:
+                        // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available)
+                        comment_text.append(cp.text).append(" (Dead)");
+                        break;
+                    case CommentPiece::Type::CROSSBOARD_LINK:
+                        // TODO: Link this to another thread and allow navigating to it
+                        comment_text.append(cp.text).append(" (Cross-thread)");
+                        break;
+                    case CommentPiece::Type::CODEBLOCK:
+                        // TODO: Use a different colored background and use a monospace font
+                        comment_text += cp.text;
+                        break;
+                }
+            });
+        return comment_text;
     }
 
     PluginResult FourchanBoardsPage::submit(const std::string &title, const std::string &url, std::vector<Tab> &result_tabs) {
@@ -264,68 +325,14 @@ namespace QuickMedia {
 
             author_str += " #" + std::to_string(post_num.asInt64());
 
-            std::string comment_text;
-            extract_comment_pieces(sub_begin, sub_end - sub_begin,
-                [&comment_text](const CommentPiece &cp) {
-                    switch(cp.type) {
-                        case CommentPiece::Type::TEXT:
-                            comment_text.append(cp.text.data, cp.text.size);
-                            break;
-                        case CommentPiece::Type::QUOTE:
-                            //comment_text += '>';
-                            //comment_text.append(cp.text.data, cp.text.size);
-                            //comment_text += '\n';
-                            break;
-                        case CommentPiece::Type::QUOTELINK: {
-                            comment_text.append(cp.text.data, cp.text.size);
-                            break;
-                        }
-                        case CommentPiece::Type::LINE_CONTINUE: {
-                            if(!comment_text.empty() && comment_text.back() == '\n') {
-                                comment_text.pop_back();
-                            }
-                            break;
-                        }
-                    }
-                }
-            );
+            std::string comment_text = html_to_text(sub_begin, sub_end - sub_begin, comment_by_postno, result_items, body_item_index);
             if(!comment_text.empty())
                 comment_text += '\n';
-            extract_comment_pieces(comment_begin, comment_end - comment_begin,
-                [&comment_text, &comment_by_postno, &result_items, body_item_index](const CommentPiece &cp) {
-                    switch(cp.type) {
-                        case CommentPiece::Type::TEXT:
-                            comment_text.append(cp.text.data, cp.text.size);
-                            break;
-                        case CommentPiece::Type::QUOTE:
-                            //comment_text += '>';
-                            //comment_text.append(cp.text.data, cp.text.size);
-                            //comment_text += '\n';
-                            break;
-                        case CommentPiece::Type::QUOTELINK: {
-                            comment_text.append(cp.text.data, cp.text.size);
-                            auto it = comment_by_postno.find(cp.quote_postnumber);
-                            if(it == comment_by_postno.end()) {
-                                // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available)
-                                comment_text += "(dead)";
-                            } else {
-                                result_items[body_item_index]->replies_to.push_back(it->second);
-                                result_items[it->second]->replies.push_back(body_item_index);
-                            }
-                            break;
-                        }
-                        case CommentPiece::Type::LINE_CONTINUE: {
-                            if(!comment_text.empty() && comment_text.back() == '\n') {
-                                comment_text.pop_back();
-                            }
-                            break;
-                        }
-                    }
-                }
-            );
+
+            comment_text += html_to_text(comment_begin, comment_end - comment_begin, comment_by_postno, result_items, body_item_index);
             if(!comment_text.empty() && comment_text.back() == '\n')
-                comment_text.back() = ' ';
-            html_unescape_sequences(comment_text);
+                comment_text.pop_back();
+
             BodyItem *body_item = result_items[body_item_index].get();
             body_item->set_title(std::move(comment_text));
             body_item->set_author(std::move(author_str));
@@ -369,6 +376,7 @@ namespace QuickMedia {
         if(!json_root.isArray())
             return PluginResult::ERR;
 
+        std::unordered_map<int64_t, size_t> comment_by_postno;
         for(const Json::Value &page_data : json_root) {
             if(!page_data.isObject())
                 continue;
@@ -395,61 +403,11 @@ namespace QuickMedia {
                 if(!thread_num.isNumeric())
                     continue;
 
-                std::string title_text;
-                extract_comment_pieces(sub_begin, sub_end - sub_begin,
-                    [&title_text](const CommentPiece &cp) {
-                        switch(cp.type) {
-                            case CommentPiece::Type::TEXT:
-                                title_text.append(cp.text.data, cp.text.size);
-                                break;
-                            case CommentPiece::Type::QUOTE:
-                                //title_text += '>';
-                                //title_text.append(cp.text.data, cp.text.size);
-                                //comment_text += '\n';
-                                break;
-                            case CommentPiece::Type::QUOTELINK: {
-                                title_text.append(cp.text.data, cp.text.size);
-                                break;
-                            }
-                            case CommentPiece::Type::LINE_CONTINUE: {
-                                if(!title_text.empty() && title_text.back() == '\n') {
-                                    title_text.pop_back();
-                                }
-                                break;
-                            }
-                        }
-                    }
-                );
+                std::string title_text = html_to_text(sub_begin, sub_end - sub_begin, comment_by_postno, result_items, 0);
                 if(!title_text.empty() && title_text.back() == '\n')
                     title_text.back() = ' ';
-                html_unescape_sequences(title_text);
-
-                std::string comment_text;
-                extract_comment_pieces(comment_begin, comment_end - comment_begin,
-                    [&comment_text](const CommentPiece &cp) {
-                        switch(cp.type) {
-                            case CommentPiece::Type::TEXT:
-                                comment_text.append(cp.text.data, cp.text.size);
-                                break;
-                            case CommentPiece::Type::QUOTE:
-                                //comment_text += '>';
-                                //comment_text.append(cp.text.data, cp.text.size);
-                                //comment_text += '\n';
-                                break;
-                            case CommentPiece::Type::QUOTELINK: {
-                                comment_text.append(cp.text.data, cp.text.size);
-                                break;
-                            }
-                            case CommentPiece::Type::LINE_CONTINUE: {
-                                if(!comment_text.empty() && comment_text.back() == '\n') {
-                                    comment_text.pop_back();
-                                }
-                                break;
-                            }
-                        }
-                    }
-                );
-                html_unescape_sequences(comment_text);
+
+                std::string comment_text = html_to_text(comment_begin, comment_end - comment_begin, comment_by_postno, result_items, 0);
                 // TODO: Do the same when wrapping is implemented
                 // TODO: Remove this
                 int num_lines = 0;
@@ -462,6 +420,7 @@ namespace QuickMedia {
                         }
                     }
                 }
+
                 auto body_item = BodyItem::create(std::move(comment_text));
                 body_item->set_author(std::move(title_text));
                 body_item->url = std::to_string(thread_num.asInt64());
author	dec05eba <dec05eba@protonmail.com>	2021-07-03 16:23:36 +0200
committer	dec05eba <dec05eba@protonmail.com>	2021-07-03 18:34:37 +0200
commit	611d22bf269672ba56f98e12eb6b2a40efdaa5b9 (patch)
tree	5dfd3e98fd08fa7cb6cb82c565b538cc891b6b98 /src/plugins/Fourchan.cpp
parent	496f71413df2468a9d3329355ffef08280219808 (diff)