aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/Fourchan.cpp
diff options
context:
space:
mode:
authordec05eba <dec05eba@protonmail.com>2021-07-03 16:23:36 +0200
committerdec05eba <dec05eba@protonmail.com>2021-07-03 18:34:37 +0200
commit611d22bf269672ba56f98e12eb6b2a40efdaa5b9 (patch)
tree5dfd3e98fd08fa7cb6cb82c565b538cc891b6b98 /src/plugins/Fourchan.cpp
parent496f71413df2468a9d3329355ffef08280219808 (diff)
Remove dependency on tidy, fix ph, support all 4chan markup
Go back to previous page when failing to fetch number of pages
Diffstat (limited to 'src/plugins/Fourchan.cpp')
-rw-r--r--src/plugins/Fourchan.cpp389
1 files changed, 174 insertions, 215 deletions
diff --git a/src/plugins/Fourchan.cpp b/src/plugins/Fourchan.cpp
index 52024e1..4b2ca61 100644
--- a/src/plugins/Fourchan.cpp
+++ b/src/plugins/Fourchan.cpp
@@ -3,10 +3,9 @@
#include "../../include/Storage.hpp"
#include "../../include/StringUtils.hpp"
#include "../../include/NetUtils.hpp"
+#include <HtmlParser.h>
#include <json/reader.h>
#include <string.h>
-#include <tidy.h>
-#include <tidybuffio.h>
// API documentation: https://github.com/4chan/4chan-API
@@ -37,133 +36,195 @@ namespace QuickMedia {
struct CommentPiece {
enum class Type {
TEXT,
- QUOTE, // >
- QUOTELINK, // >>POSTNO,
- LINE_CONTINUE
+ QUOTE, // >, Set for span
+ QUOTE_CONTINUE, // Set for span
+ QUOTELINK, // >>POSTNO, Set for a
+ DEADLINK, // Set for span
+ CROSSBOARD_LINK, // Set for a
+ CODEBLOCK // Set for pre
};
- DataView text; // Set when type is TEXT, QUOTE or QUOTELINK
+ std::string text;
int64_t quote_postnumber = 0; // Set when type is QUOTELINK
Type type;
};
- static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
- for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
- const char *attr_name = tidyAttrName(attr);
- if(attr_name && strcmp(name, attr_name) == 0)
- return attr;
- }
- return nullptr;
- }
-
- static const char* get_attribute_value(TidyNode node, const char *name) {
- TidyAttr attr = get_attribute_by_name(node, name);
- if(attr)
- return tidyAttrValue(attr);
- return nullptr;
- }
-
- static void lstrip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
- size_t i = 0;
- while(i < size && str[i] == '\n') {
- ++i;
- }
- *output_str = str + i;
- *output_size = size - i;
- }
+ enum class NodeType { // HTML tags that are tracked on the node stack while parsing a comment
+ A, // <a>
+ SPAN, // <span>
+ PRE // <pre>
+ };
- static void rstrip_newline(const char *str, size_t size, size_t *output_size) {
- ssize_t i = size - 1;
- while(i >= 0 && str[i] == '\n') {
- --i;
+ // Returns (NodeType)-1 if the tag name is not "a", "span" or "pre"
+ static NodeType tag_name_to_node_type(HtmlStringView str) {
+ if(str.size == 1 && str.data[0] == 'a') {
+ return NodeType::A;
+ } else if(str.size == 4 && memcmp(str.data, "span", 4) == 0) {
+ return NodeType::SPAN;
+ } else if(str.size == 3 && memcmp(str.data, "pre", 3) == 0) {
+ return NodeType::PRE;
+ } else {
+ return (NodeType)-1;
}
- *output_size = i + 1;
}
- static void strip_newline(const char *str, size_t size, const char **output_str, size_t *output_size) {
- lstrip_newline(str, size, output_str, output_size);
- rstrip_newline(*output_str, *output_size, output_size);
- }
+ struct HtmlNode { // One open <a>, <span> or <pre> tag tracked while parsing
+ NodeType node_type;
+ std::string klass; // Value of the "class" attribute seen while this node was on top of the stack
+ std::string href; // Value of the "href" attribute seen while this node was on top of the stack
+ int output_count = 0; // Number of text pieces emitted so far while this node was on top of the stack
+ };
using CommentPieceCallback = std::function<void(const CommentPiece&)>;
- static int extract_comment_pieces(TidyDoc doc, TidyNode node, CommentPieceCallback &callback) {
- for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
- const char *node_name = tidyNodeGetName(child);
- if(node_name) {
- if(strcmp(node_name, "br") == 0) {
+ struct HtmlParseUserdata { // State passed to html_parse_callback through its void *userdata argument
+ CommentPieceCallback callback; // Invoked once for every comment piece produced while parsing
+ std::stack<HtmlNode> html_node; // Currently open <a>/<span>/<pre> tags, innermost on top
+ };
+
+ // Parser callback that turns HTML parse events into CommentPiece values.
+ // userdata must point to a HtmlParseUserdata; its callback receives every produced piece.
+ // - Tag start: <br> emits a TEXT piece containing "\n"; <a>, <span> and <pre> are pushed onto the node stack.
+ // - Tag end: </a>, </span> and </pre> pop the top of the node stack (if the stack is not empty).
+ // - Attribute: "class" and "href" values are stored on the node that is on top of the stack.
+ // - Text: the text is html-unescaped, classified by the node on top of the stack and passed to the callback.
+ // Always returns 0.
+ static int html_parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) {
+ HtmlParseUserdata *parse_userdata = (HtmlParseUserdata*)userdata;
+
+ switch(parse_type) {
+ case HTML_PARSE_TAG_START: {
+ if(html_parser->tag_name.size == 2 && memcmp(html_parser->tag_name.data, "br", 2) == 0) {
CommentPiece comment_piece;
comment_piece.type = CommentPiece::Type::TEXT;
- // Warning: Cast from const char* to char* ...
- comment_piece.text = { (char*)"\n", 1 };
- callback(comment_piece);
- /*} else if(strcmp(node_name, "span") == 0) {
- const char *span_class = get_attribute_value(child, "class");
- //fprintf(stderr, "span class: %s\n", span_class);
- if(span_class && strcmp(span_class, "quote") == 0) {
- CommentPiece comment_piece;
- comment_piece.type = CommentPiece::Type::QUOTE;
- // Warning: Cast from const char* to char* ...
- comment_piece.text = { (char*)"\n", 1 };
- callback(comment_piece);
- }*/
+ comment_piece.text = "\n";
+ parse_userdata->callback(comment_piece);
}
- }
- if(tidyNodeGetType(child) == TidyNode_Text) {
- TidyBuffer tidy_buffer;
- tidyBufInit(&tidy_buffer);
- if(tidyNodeGetText(doc, child, &tidy_buffer)) {
- const char *inner_text = (const char*)tidy_buffer.bp;
- size_t inner_text_size = tidy_buffer.size;
- strip_newline(inner_text, inner_text_size, &inner_text, &inner_text_size);
-
- const char *node_name = tidyNodeGetName(node);
- if(node_name && strcmp(node_name, "a") == 0) {
- const char *a_class = get_attribute_value(node, "class");
- const char *a_href = get_attribute_value(node, "href");
- if(a_class && a_href && strcmp(a_class, "quotelink") == 0 && strncmp(a_href, "#p", 2) == 0) {
- CommentPiece comment_piece;
- comment_piece.type = CommentPiece::Type::QUOTELINK;
- comment_piece.quote_postnumber = strtoll(a_href + 2, nullptr, 10);
- // Warning: Cast from const char* to char* ...
- comment_piece.text = { (char*)inner_text, inner_text_size };
- callback(comment_piece);
- tidyBufFree(&tidy_buffer);
- continue;
- }
+ // Only <a>, <span> and <pre> are tracked; every other tag leaves the stack untouched
+ const NodeType node_type = tag_name_to_node_type(html_parser->tag_name);
+ if(node_type != (NodeType)-1)
+ parse_userdata->html_node.push({ node_type, "", "", 0 });
+ break;
+ }
+ case HTML_PARSE_TAG_END: {
+ if(!parse_userdata->html_node.empty()) {
+ const NodeType node_type = tag_name_to_node_type(html_parser->tag_name);
+ if(node_type != (NodeType)-1)
+ parse_userdata->html_node.pop();
+ }
+ break;
+ }
+ case HTML_PARSE_ATTRIBUTE: {
+ // NOTE(review): attributes are stored on the top tracked node even if they belong to an untracked tag nested inside it - confirm this cannot happen with 4chan markup
+ if(!parse_userdata->html_node.empty()) {
+ HtmlNode &html_node = parse_userdata->html_node.top();
+ if(html_parser->attribute_key.size == 5 && memcmp(html_parser->attribute_key.data, "class", 5) == 0) {
+ html_node.klass.assign(html_parser->attribute_value.data, html_parser->attribute_value.size);
+ } else if(html_parser->attribute_key.size == 4 && memcmp(html_parser->attribute_key.data, "href", 4) == 0) {
+ html_node.href.assign(html_parser->attribute_value.data, html_parser->attribute_value.size);
}
+ }
+ break;
+ }
+ case HTML_PARSE_TEXT: {
+ std::string text(html_parser->text.data, html_parser->text.size);
+ html_unescape_sequences(text);
- CommentPiece comment_piece;
+ // Pieces default to TEXT; the node on top of the stack (if any) may refine the type below
+ CommentPiece comment_piece;
+ comment_piece.type = CommentPiece::Type::TEXT;
+ comment_piece.text = std::move(text);
+
+ if(parse_userdata->html_node.empty()) {
comment_piece.type = CommentPiece::Type::TEXT;
- // Warning: Cast from const char* to char* ...
- comment_piece.text = { (char*)inner_text, inner_text_size };
- callback(comment_piece);
+ } else {
+ HtmlNode &html_node = parse_userdata->html_node.top();
+ switch(html_node.node_type) {
+ case NodeType::A: {
+ if(html_node.klass == "quotelink") {
+ if(string_starts_with(html_node.href, "#p")) {
+ comment_piece.type = CommentPiece::Type::QUOTELINK;
+ // The post number follows the "#p" prefix of the href
+ comment_piece.quote_postnumber = strtoll(html_node.href.c_str() + 2, nullptr, 10);
+ } else if(string_starts_with(html_node.href, "/")) {
+ comment_piece.type = CommentPiece::Type::CROSSBOARD_LINK;
+ } else {
+ fprintf(stderr, "Unexpected href for quotelink: %s\n", html_node.href.c_str());
+ }
+ } else {
+ fprintf(stderr, "Unexpected class for a: %s\n", html_node.klass.c_str());
+ }
+ break;
+ }
+ case NodeType::SPAN: {
+ if(html_node.klass == "quote") {
+ // The first piece emitted for the span starts the quote, later pieces of the same span continue it
+ comment_piece.type = html_node.output_count ? CommentPiece::Type::QUOTE_CONTINUE : CommentPiece::Type::QUOTE;
+ } else if(html_node.klass == "deadlink") {
+ comment_piece.type = CommentPiece::Type::DEADLINK;
+ } else {
+ fprintf(stderr, "Unexpected class for span: %s\n", html_node.klass.c_str());
+ }
+ break;
+ }
+ case NodeType::PRE: {
+ if(html_node.klass == "prettyprint") {
+ comment_piece.type = CommentPiece::Type::CODEBLOCK;
+ } else {
+ fprintf(stderr, "Unexpected class for pre: %s\n", html_node.klass.c_str());
+ }
+ break;
+ }
+ }
+ html_node.output_count++;
}
- tidyBufFree(&tidy_buffer);
- } else {
- int res = extract_comment_pieces(doc, child, callback);
- if(res != 0)
- return res;
+
+ parse_userdata->callback(comment_piece);
+ break;
+ }
+ case HTML_PARSE_JAVASCRIPT_CODE: {
+ break;
}
}
+
return 0;
}
static void extract_comment_pieces(const char *html_source, size_t size, CommentPieceCallback callback) { // Parses html_source and passes every extracted comment piece to callback
- TidyDoc doc = tidyCreate();
- tidyOptSetBool(doc, TidyShowWarnings, no);
- tidyOptSetInt(doc, TidyUseCustomTags, 1);
- tidyOptSetInt(doc, TidyWrapLen, 0);
- if(tidyParseString(doc, html_source) < 0) {
- CommentPiece comment_piece;
- comment_piece.type = CommentPiece::Type::TEXT;
- // Warning: Cast from const char* to char* ...
- comment_piece.text = { (char*)html_source, size };
- callback(comment_piece);
- } else {
- extract_comment_pieces(doc, tidyGetRoot(doc), callback);
- }
- tidyRelease(doc);
+ HtmlParseUserdata parse_userdata; // Starts with an empty node stack
+ parse_userdata.callback = std::move(callback);
+ html_parser_parse(html_source, size, html_parse_callback, &parse_userdata); // NOTE(review): any status returned by html_parser_parse is discarded here - confirm that is intended
+ }
+
+    // Converts a HTML fragment of a 4chan post into plain text.
+    // html_source, size: the HTML fragment to convert.
+    // comment_by_postno: maps a post number to the index of that post in result_items; used to resolve quotelinks.
+    // result_items, body_item_index: the item at body_item_index is the post being converted. For every quotelink that
+    //   resolves, the quoted item's index is appended to that post's replies_to and body_item_index is appended to
+    //   the quoted item's replies.
+    // Returns the text of all comment pieces concatenated in parse order. Quotelinks that do not resolve and deadlinks
+    // get a " (Dead)" suffix, crossboard links get a " (Cross-thread)" suffix.
+    static std::string html_to_text(const char *html_source, size_t size, std::unordered_map<int64_t, size_t> &comment_by_postno, BodyItems &result_items, size_t body_item_index) {
+        std::string comment_text;
+        extract_comment_pieces(html_source, size,
+            [&comment_text, &comment_by_postno, &result_items, body_item_index](const CommentPiece &cp) {
+                // cp is const so its text can only be copied; the text is appended directly instead of
+                // going through std::move (which would silently copy) or a temporary string
+                switch(cp.type) {
+                    case CommentPiece::Type::TEXT:
+                    case CommentPiece::Type::QUOTE:
+                    case CommentPiece::Type::QUOTE_CONTINUE:
+                        comment_text += cp.text;
+                        break;
+                    case CommentPiece::Type::QUOTELINK: {
+                        comment_text += cp.text;
+                        auto it = comment_by_postno.find(cp.quote_postnumber);
+                        if(it == comment_by_postno.end()) {
+                            // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available)
+                            comment_text += " (Dead)";
+                        } else {
+                            // Link the two posts in both directions
+                            result_items[body_item_index]->replies_to.push_back(it->second);
+                            result_items[it->second]->replies.push_back(body_item_index);
+                        }
+                        break;
+                    }
+                    case CommentPiece::Type::DEADLINK:
+                        // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available)
+                        comment_text += cp.text;
+                        comment_text += " (Dead)";
+                        break;
+                    case CommentPiece::Type::CROSSBOARD_LINK:
+                        // TODO: Link this to another thread and allow navigating to it
+                        comment_text += cp.text;
+                        comment_text += " (Cross-thread)";
+                        break;
+                    case CommentPiece::Type::CODEBLOCK:
+                        // TODO: Use a different colored background and use a monospace font
+                        comment_text += cp.text;
+                        break;
+                }
+            });
+        return comment_text;
     }
PluginResult FourchanBoardsPage::submit(const std::string &title, const std::string &url, std::vector<Tab> &result_tabs) {
@@ -264,68 +325,14 @@ namespace QuickMedia {
author_str += " #" + std::to_string(post_num.asInt64());
- std::string comment_text;
- extract_comment_pieces(sub_begin, sub_end - sub_begin,
- [&comment_text](const CommentPiece &cp) {
- switch(cp.type) {
- case CommentPiece::Type::TEXT:
- comment_text.append(cp.text.data, cp.text.size);
- break;
- case CommentPiece::Type::QUOTE:
- //comment_text += '>';
- //comment_text.append(cp.text.data, cp.text.size);
- //comment_text += '\n';
- break;
- case CommentPiece::Type::QUOTELINK: {
- comment_text.append(cp.text.data, cp.text.size);
- break;
- }
- case CommentPiece::Type::LINE_CONTINUE: {
- if(!comment_text.empty() && comment_text.back() == '\n') {
- comment_text.pop_back();
- }
- break;
- }
- }
- }
- );
+ std::string comment_text = html_to_text(sub_begin, sub_end - sub_begin, comment_by_postno, result_items, body_item_index);
if(!comment_text.empty())
comment_text += '\n';
- extract_comment_pieces(comment_begin, comment_end - comment_begin,
- [&comment_text, &comment_by_postno, &result_items, body_item_index](const CommentPiece &cp) {
- switch(cp.type) {
- case CommentPiece::Type::TEXT:
- comment_text.append(cp.text.data, cp.text.size);
- break;
- case CommentPiece::Type::QUOTE:
- //comment_text += '>';
- //comment_text.append(cp.text.data, cp.text.size);
- //comment_text += '\n';
- break;
- case CommentPiece::Type::QUOTELINK: {
- comment_text.append(cp.text.data, cp.text.size);
- auto it = comment_by_postno.find(cp.quote_postnumber);
- if(it == comment_by_postno.end()) {
- // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available)
- comment_text += "(dead)";
- } else {
- result_items[body_item_index]->replies_to.push_back(it->second);
- result_items[it->second]->replies.push_back(body_item_index);
- }
- break;
- }
- case CommentPiece::Type::LINE_CONTINUE: {
- if(!comment_text.empty() && comment_text.back() == '\n') {
- comment_text.pop_back();
- }
- break;
- }
- }
- }
- );
+
+ comment_text += html_to_text(comment_begin, comment_end - comment_begin, comment_by_postno, result_items, body_item_index);
if(!comment_text.empty() && comment_text.back() == '\n')
- comment_text.back() = ' ';
- html_unescape_sequences(comment_text);
+ comment_text.pop_back();
+
BodyItem *body_item = result_items[body_item_index].get();
body_item->set_title(std::move(comment_text));
body_item->set_author(std::move(author_str));
@@ -369,6 +376,7 @@ namespace QuickMedia {
if(!json_root.isArray())
return PluginResult::ERR;
+ std::unordered_map<int64_t, size_t> comment_by_postno;
for(const Json::Value &page_data : json_root) {
if(!page_data.isObject())
continue;
@@ -395,61 +403,11 @@ namespace QuickMedia {
if(!thread_num.isNumeric())
continue;
- std::string title_text;
- extract_comment_pieces(sub_begin, sub_end - sub_begin,
- [&title_text](const CommentPiece &cp) {
- switch(cp.type) {
- case CommentPiece::Type::TEXT:
- title_text.append(cp.text.data, cp.text.size);
- break;
- case CommentPiece::Type::QUOTE:
- //title_text += '>';
- //title_text.append(cp.text.data, cp.text.size);
- //comment_text += '\n';
- break;
- case CommentPiece::Type::QUOTELINK: {
- title_text.append(cp.text.data, cp.text.size);
- break;
- }
- case CommentPiece::Type::LINE_CONTINUE: {
- if(!title_text.empty() && title_text.back() == '\n') {
- title_text.pop_back();
- }
- break;
- }
- }
- }
- );
+ std::string title_text = html_to_text(sub_begin, sub_end - sub_begin, comment_by_postno, result_items, 0);
if(!title_text.empty() && title_text.back() == '\n')
title_text.back() = ' ';
- html_unescape_sequences(title_text);
-
- std::string comment_text;
- extract_comment_pieces(comment_begin, comment_end - comment_begin,
- [&comment_text](const CommentPiece &cp) {
- switch(cp.type) {
- case CommentPiece::Type::TEXT:
- comment_text.append(cp.text.data, cp.text.size);
- break;
- case CommentPiece::Type::QUOTE:
- //comment_text += '>';
- //comment_text.append(cp.text.data, cp.text.size);
- //comment_text += '\n';
- break;
- case CommentPiece::Type::QUOTELINK: {
- comment_text.append(cp.text.data, cp.text.size);
- break;
- }
- case CommentPiece::Type::LINE_CONTINUE: {
- if(!comment_text.empty() && comment_text.back() == '\n') {
- comment_text.pop_back();
- }
- break;
- }
- }
- }
- );
- html_unescape_sequences(comment_text);
+
+ std::string comment_text = html_to_text(comment_begin, comment_end - comment_begin, comment_by_postno, result_items, 0);
// TODO: Do the same when wrapping is implemented
// TODO: Remove this
int num_lines = 0;
@@ -462,6 +420,7 @@ namespace QuickMedia {
}
}
}
+
auto body_item = BodyItem::create(std::move(comment_text));
body_item->set_author(std::move(title_text));
body_item->url = std::to_string(thread_num.asInt64());