aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/Fourchan.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/Fourchan.cpp')
-rw-r--r--src/plugins/Fourchan.cpp304
1 files changed, 186 insertions, 118 deletions
diff --git a/src/plugins/Fourchan.cpp b/src/plugins/Fourchan.cpp
index 01039b9..5a8cada 100644
--- a/src/plugins/Fourchan.cpp
+++ b/src/plugins/Fourchan.cpp
@@ -1,6 +1,7 @@
#include "../../plugins/Fourchan.hpp"
#include <json/reader.h>
#include <string.h>
+#include "../../include/DataView.hpp"
#include <tidy.h>
#include <tidybuffio.h>
@@ -348,11 +349,115 @@ namespace QuickMedia {
}
static bool string_ends_with(const std::string &str, const std::string &ends_with_str) {
- size_t len = ends_with_str.size();
- return len == 0 || (str.size() >= len && memcmp(&str[str.size() - len], ends_with_str.data(), len) == 0);
+ size_t ends_len = ends_with_str.size();
+ return ends_len == 0 || (str.size() >= ends_len && memcmp(&str[str.size() - ends_len], ends_with_str.data(), ends_len) == 0);
}
- PluginResult Fourchan::get_content_list(const std::string &url, BodyItems &result_items) {
+ struct CommentPiece {
+ enum class Type {
+ TEXT,
+ QUOTE, // >
+ QUOTELINK, // >>POSTNO,
+ LINEBREAK
+ };
+
+ DataView text; // Set when type is TEXT, QUOTE or QUOTELINK
+ int64_t quote_postnumber; // Set when type is QUOTELINK
+ Type type;
+ };
+
+ static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
+ for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
+ const char *attr_name = tidyAttrName(attr);
+ if(attr_name && strcmp(name, attr_name) == 0)
+ return attr;
+ }
+ return nullptr;
+ }
+
+ static const char* get_attribute_value(TidyNode node, const char *name) {
+ TidyAttr attr = get_attribute_by_name(node, name);
+ if(attr)
+ return tidyAttrValue(attr);
+ return nullptr;
+ }
+
+ using CommentPieceCallback = std::function<void(const CommentPiece&)>;
+ static void extract_comment_pieces(TidyDoc doc, TidyNode node, CommentPieceCallback callback) {
+ for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
+ //extract_comment_pieces(doc, child, callback);
+ const char *node_name = tidyNodeGetName(child);
+ TidyNodeType node_type = tidyNodeGetType(child);
+ if(node_type == TidyNode_Start && node_name) {
+ TidyNode text_node = tidyGetChild(child);
+ //fprintf(stderr, "Child node name: %s, child text type: %d\n", node_name, tidyNodeGetType(text_node));
+ if(tidyNodeGetType(text_node) == TidyNode_Text) {
+ TidyBuffer tidy_buffer;
+ tidyBufInit(&tidy_buffer);
+ if(tidyNodeGetText(doc, text_node, &tidy_buffer)) {
+ CommentPiece comment_piece;
+ comment_piece.type = CommentPiece::Type::TEXT;
+ comment_piece.text = { (char*)tidy_buffer.bp, tidy_buffer.size };
+ if(strcmp(node_name, "span") == 0) {
+ const char *span_class = get_attribute_value(child, "class");
+ //fprintf(stderr, "span class: %s\n", span_class);
+ if(span_class && strcmp(span_class, "quote") == 0)
+ comment_piece.type = CommentPiece::Type::QUOTE;
+ } else if(strcmp(node_name, "a") == 0) {
+ const char *a_class = get_attribute_value(child, "class");
+ const char *a_href = get_attribute_value(child, "href");
+ //fprintf(stderr, "a class: %s, href: %s\n", a_class, a_href);
+ if(a_class && a_href && strcmp(a_class, "quotelink") == 0 && strncmp(a_href, "#p", 2) == 0) {
+ comment_piece.type = CommentPiece::Type::QUOTELINK;
+ comment_piece.quote_postnumber = strtoll(a_href + 2, nullptr, 10);
+ }
+ }
+ /*
+ for(size_t i = 0; i < comment_piece.text.size; ++i) {
+ if(comment_piece.text.data[i] == '\n')
+ comment_piece.text.data[i] = ' ';
+ }
+ */
+ callback(comment_piece);
+ }
+ tidyBufFree(&tidy_buffer);
+ }
+ } else if(node_type == TidyNode_Text) {
+ TidyBuffer tidy_buffer;
+ tidyBufInit(&tidy_buffer);
+ if(tidyNodeGetText(doc, child, &tidy_buffer)) {
+ CommentPiece comment_piece;
+ comment_piece.type = CommentPiece::Type::TEXT;
+ comment_piece.text = { (char*)tidy_buffer.bp, tidy_buffer.size };
+ /*
+ for(size_t i = 0; i < comment_piece.text.size; ++i) {
+ if(comment_piece.text.data[i] == '\n')
+ comment_piece.text.data[i] = ' ';
+ }
+ */
+ callback(comment_piece);
+ }
+ tidyBufFree(&tidy_buffer);
+ }
+ }
+ }
+
+ static void extract_comment_pieces(const char *html_source, size_t size, CommentPieceCallback callback) {
+ TidyDoc doc = tidyCreate();
+ tidyOptSetBool(doc, TidyShowWarnings, no);
+ if(tidyParseString(doc, html_source) < 0) {
+ CommentPiece comment_piece;
+ comment_piece.type = CommentPiece::Type::TEXT;
+ // Warning: Cast from const char* to char* ...
+ comment_piece.text = { (char*)html_source, size };
+ callback(comment_piece);
+ } else {
+ extract_comment_pieces(doc, tidyGetBody(doc), std::move(callback));
+ }
+ tidyRelease(doc);
+ }
+
+ PluginResult Fourchan::get_threads(const std::string &url, BodyItems &result_items) {
std::string server_response;
if(download_to_string(fourchan_url + url + "/catalog.json", server_response) != DownloadResult::OK)
return PluginResult::NET_ERR;
@@ -380,14 +485,38 @@ namespace QuickMedia {
continue;
const Json::Value &com = thread["com"];
- if(!com.isString())
- continue;
+ const char *comment_begin = "";
+ const char *comment_end = comment_begin;
+ com.getString(&comment_begin, &comment_end);
const Json::Value &thread_num = thread["no"];
if(!thread_num.isNumeric())
continue;
- auto body_item = std::make_unique<BodyItem>(com.asString());
+ std::string comment_text;
+ extract_comment_pieces(comment_begin, comment_end - comment_begin,
+ [&comment_text](const CommentPiece &cp) {
+ switch(cp.type) {
+ case CommentPiece::Type::TEXT:
+ comment_text.append(cp.text.data, cp.text.size);
+ break;
+ case CommentPiece::Type::QUOTE:
+ comment_text += '>';
+ comment_text.append(cp.text.data, cp.text.size);
+ //comment_text += '\n';
+ break;
+ case CommentPiece::Type::QUOTELINK: {
+ comment_text.append(cp.text.data, cp.text.size);
+ break;
+ }
+ case CommentPiece::Type::LINEBREAK:
+ // comment_text += '\n';
+ break;
+ }
+ }
+ );
+ html_unescape_sequences(comment_text);
+ auto body_item = std::make_unique<BodyItem>(std::move(comment_text));
body_item->url = std::to_string(thread_num.asInt64());
const Json::Value &ext = thread["ext"];
@@ -411,95 +540,7 @@ namespace QuickMedia {
return PluginResult::OK;
}
- struct CommentPiece {
- enum class Type {
- TEXT,
- QUOTE, // >
- QUOTELINK, // >>POSTNO,
- LINEBREAK
- };
-
- std::string_view text; // Set when type is TEXT, QUOTE or QUOTELINK
- int64_t quote_postnumber; // Set when type is QUOTELINK
- Type type;
- };
-
- static TidyAttr get_attribute_by_name(TidyNode node, const char *name) {
- for(TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
- const char *attr_name = tidyAttrName(attr);
- if(attr_name && strcmp(name, attr_name) == 0)
- return attr;
- }
- return nullptr;
- }
-
- static const char* get_attribute_value(TidyNode node, const char *name) {
- TidyAttr attr = get_attribute_by_name(node, name);
- if(attr)
- return tidyAttrValue(attr);
- return nullptr;
- }
-
- using CommentPieceCallback = std::function<void(const CommentPiece&)>;
- static void extract_comment_pieces(TidyDoc doc, TidyNode node, CommentPieceCallback callback) {
- const char *node_name = tidyNodeGetName(node);
- printf("node name: %s\n", node_name ? node_name : "N/A");
-
- TidyNodeType node_type = tidyNodeGetType(node);
- if(node_type == TidyNode_Text) {
- TidyBuffer tidy_buffer;
- tidyBufInit(&tidy_buffer);
- if(tidyNodeGetText(doc, node, &tidy_buffer)) {
- CommentPiece comment_piece;
- comment_piece.type = CommentPiece::Type::TEXT;
- if(node_name) {
- if(strncmp("a", (const char*)tidy_buffer.bp, tidy_buffer.size) == 0) {
- const char *a_href = get_attribute_value(node, "href");
- if(a_href && strncmp(a_href, "#p", 2) == 0) {
- comment_piece.type = CommentPiece::Type::QUOTELINK;
- comment_piece.quote_postnumber = strtoll(a_href + 2, nullptr, 10);
- }
- } else if(strncmp("span", (const char*)tidy_buffer.bp, tidy_buffer.size) == 0) {
- const char *a_class = get_attribute_value(node, "class");
- if(a_class && strcmp(a_class, "quote") == 0) {
- comment_piece.type = CommentPiece::Type::QUOTE;
- comment_piece.text = { (const char*)tidy_buffer.bp, tidy_buffer.size };
- }
- }
- }
- printf("Comment piece value: %.*s\n", tidy_buffer.size, tidy_buffer.bp);
- comment_piece.text = { (const char*)tidy_buffer.bp, tidy_buffer.size };
- callback(comment_piece);
- }
- tidyBufFree(&tidy_buffer);
- } else if(node_name) {
- if(strcmp("br", node_name) == 0) {
- CommentPiece comment_piece;
- comment_piece.type = CommentPiece::Type::LINEBREAK;
- callback(comment_piece);
- }
- }
-
- for(TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
- extract_comment_pieces(doc, child, callback);
- }
- }
-
- static void extract_comment_pieces(const char *html_source, size_t size, CommentPieceCallback callback) {
- TidyDoc doc = tidyCreate();
- tidyOptSetBool(doc, TidyShowWarnings, no);
- if(tidyParseString(doc, html_source) < 0) {
- CommentPiece comment_piece;
- comment_piece.type = CommentPiece::Type::TEXT;
- comment_piece.text = { html_source, size };
- callback(comment_piece);
- } else {
- extract_comment_pieces(doc, tidyGetBody(doc), std::move(callback));
- }
- tidyRelease(doc);
- }
-
- PluginResult Fourchan::get_content_details(const std::string &list_url, const std::string &url, BodyItems &result_items) {
+ PluginResult Fourchan::get_thread_comments(const std::string &list_url, const std::string &url, BodyItems &result_items) {
std::string server_response;
if(download_to_string(fourchan_url + list_url + "/thread/" + url + ".json", server_response) != DownloadResult::OK)
return PluginResult::NET_ERR;
@@ -513,43 +554,70 @@ namespace QuickMedia {
return PluginResult::ERR;
}
+ std::unordered_map<int64_t, size_t> comment_by_postno;
+
const Json::Value &posts = json_root["posts"];
if(posts.isArray()) {
for(const Json::Value &post : posts) {
if(!post.isObject())
continue;
- const Json::Value &com = post["com"];
- const char *comment_begin;
- const char *comment_end;
- if(!com.getString(&comment_begin, &comment_end))
+ const Json::Value &post_num = post["no"];
+ if(!post_num.isNumeric())
continue;
+
+ comment_by_postno[post_num.asInt64()] = result_items.size();
+ result_items.push_back(std::make_unique<BodyItem>(""));
+ }
+ }
+
+ size_t body_item_index = 0;
+ if(posts.isArray()) {
+ for(const Json::Value &post : posts) {
+ if(!post.isObject())
+ continue;
+
+ const Json::Value &com = post["com"];
+ const char *comment_begin = "";
+ const char *comment_end = comment_begin;
+ com.getString(&comment_begin, &comment_end);
const Json::Value &post_num = post["no"];
if(!post_num.isNumeric())
continue;
std::string comment_text;
- extract_comment_pieces(comment_begin, comment_end - comment_begin, [&comment_text](const CommentPiece &cp) {
- switch(cp.type) {
- case CommentPiece::Type::TEXT:
- comment_text += cp.text;
- break;
- case CommentPiece::Type::QUOTE:
- comment_text += '>';
- comment_text += cp.text;
- break;
- case CommentPiece::Type::QUOTELINK:
- comment_text += cp.text;
- break;
- case CommentPiece::Type::LINEBREAK:
- // comment_text += '\n';
- break;
+ extract_comment_pieces(comment_begin, comment_end - comment_begin,
+ [&comment_text, &comment_by_postno, &result_items, body_item_index](const CommentPiece &cp) {
+ switch(cp.type) {
+ case CommentPiece::Type::TEXT:
+ comment_text.append(cp.text.data, cp.text.size);
+ break;
+ case CommentPiece::Type::QUOTE:
+ comment_text += '>';
+ comment_text.append(cp.text.data, cp.text.size);
+ //comment_text += '\n';
+ break;
+ case CommentPiece::Type::QUOTELINK: {
+ comment_text.append(cp.text.data, cp.text.size);
+ auto it = comment_by_postno.find(cp.quote_postnumber);
+ if(it == comment_by_postno.end()) {
+ // TODO: Link this quote to a 4chan archive that still has the quoted comment (if available)
+ comment_text += "(dead)";
+ } else {
+ result_items[it->second]->replies.push_back(body_item_index);
+ }
+ break;
+ }
+ case CommentPiece::Type::LINEBREAK:
+ // comment_text += '\n';
+ break;
+ }
}
- });
+ );
html_unescape_sequences(comment_text);
- auto body_item = std::make_unique<BodyItem>(std::move(comment_text));
- body_item->url = std::to_string(post_num.asInt64());
+ BodyItem *body_item = result_items[body_item_index].get();
+ body_item->title = std::move(comment_text);
const Json::Value &ext = post["ext"];
const Json::Value &tim = post["tim"];
@@ -564,7 +632,7 @@ namespace QuickMedia {
body_item->thumbnail_url = fourchan_image_url + list_url + "/" + std::to_string(tim.asInt64()) + "s.jpg";
}
- result_items.emplace_back(std::move(body_item));
+ ++body_item_index;
}
}