From 5c1b4fb3198ec398e8b292effe5ca84d280939f3 Mon Sep 17 00:00:00 2001 From: dec05eba Date: Mon, 19 Oct 2020 01:58:41 +0200 Subject: Ph: fix related videos images and titles missing or incorrect. Use message type to open in video player instead of using http head --- src/plugins/Pornhub.cpp | 211 +++++++++++++++++++++++++----------------------- 1 file changed, 112 insertions(+), 99 deletions(-) (limited to 'src/plugins/Pornhub.cpp') diff --git a/src/plugins/Pornhub.cpp b/src/plugins/Pornhub.cpp index e8df9d7..b063a32 100644 --- a/src/plugins/Pornhub.cpp +++ b/src/plugins/Pornhub.cpp @@ -1,7 +1,9 @@ #include "../../plugins/Pornhub.hpp" #include "../../include/StringUtils.hpp" #include "../../include/NetUtils.hpp" -#include +extern "C" { +#include +} #include namespace QuickMedia { @@ -9,64 +11,124 @@ namespace QuickMedia { return strncmp(str, begin_with, strlen(begin_with)) == 0; } - static bool contains(const char *str, const char *substr) { - return strstr(str, substr); + // TODO: Optimize by using HtmlStringView instead of std::string + struct HtmlElement { + std::string tag_name; + std::map attributes; + std::vector children; + HtmlElement *parent = nullptr; // ref + }; + + static void html_cleanup(HtmlElement *html_element_root) { + for(HtmlElement *child_html_element : html_element_root->children) { + html_cleanup(child_html_element); + } + delete html_element_root; } - SearchResult PornhubSearchPage::search(const std::string &str, BodyItems &result_items) { - std::string url = "https://www.pornhub.com/video/search?search="; - url += url_param_encode(str); + static const std::string& html_get_attribute_or(HtmlElement *html_element, const std::string &attr_key, const std::string &default_value) { + auto it = html_element->attributes.find(attr_key); + if(it != html_element->attributes.end()) + return it->second; + else + return default_value; + } + + struct HtmlParseUserdata { + HtmlElement *current_html_element; + }; + + static void html_page_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) { + HtmlParseUserdata *parse_userdata = (HtmlParseUserdata*)userdata; + if(parse_type == HTML_PARSE_TAG_START) { + auto new_html_element = new HtmlElement(); + new_html_element->tag_name.assign(html_parser->tag_name.data, html_parser->tag_name.size); + new_html_element->parent = parse_userdata->current_html_element; + + parse_userdata->current_html_element->children.push_back(new_html_element); + parse_userdata->current_html_element = new_html_element; + } else if(parse_type == HTML_PARSE_TAG_END) { + if(parse_userdata->current_html_element->parent) + parse_userdata->current_html_element = parse_userdata->current_html_element->parent; + } else if(parse_type == HTML_PARSE_ATTRIBUTE) { + std::string attr_key(html_parser->attribute_key.data, html_parser->attribute_key.size); + std::string attr_value(html_parser->attribute_value.data, html_parser->attribute_value.size); + parse_userdata->current_html_element->attributes.insert(std::make_pair(std::move(attr_key), std::move(attr_value))); + } + } + + static HtmlElement* html_parse(char *source, size_t size) { + HtmlElement *html_element_root = new HtmlElement(); + HtmlParseUserdata parse_userdata; + parse_userdata.current_html_element = html_element_root; + HtmlParser html_parser; + html_parser_init(&html_parser, source, size, html_page_callback, &parse_userdata); + html_parser_parse(&html_parser); + html_parser_deinit(&html_parser); + return html_element_root; + } + + using HtmlFindTagsCallback = std::function; + static void html_find_tags_with_class(HtmlElement *html_element, const std::string &tag_name, const std::string &class_value, const HtmlFindTagsCallback &callback) { + if(html_element->tag_name == tag_name) { + if(html_get_attribute_or(html_element, "class", "") == class_value) + callback(html_element); + } + for(HtmlElement *child_html_element : html_element->children) { + html_find_tags_with_class(child_html_element, tag_name, class_value, callback); + } + } + + static void html_find_tags(HtmlElement *html_element, const std::string &tag_name, const HtmlFindTagsCallback &callback) { + if(html_element->tag_name == tag_name) + callback(html_element); + for(HtmlElement *child_html_element : html_element->children) { + html_find_tags(child_html_element, tag_name, callback); + } + } + static SearchResult get_videos_in_page(const std::string &url, bool use_tor, BodyItems &result_items) { std::string website_data; - if(download_to_string(url, website_data, {}, is_tor_enabled()) != DownloadResult::OK) + if(download_to_string(url, website_data, {}, use_tor) != DownloadResult::OK) return SearchResult::NET_ERR; - struct ItemData { - BodyItems *result_items; - size_t index; - }; - ItemData item_data = { &result_items, 0 }; - - QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); - if(result != 0) - goto cleanup; - - result = quickmedia_html_find_nodes_xpath(&html_search, "//div[class='phimage']//a", - [](QuickMediaHtmlNode *node, void *userdata) { - auto *result_items = (BodyItems*)userdata; - const char *href = quickmedia_html_node_get_attribute_value(node, "href"); - const char *title = quickmedia_html_node_get_attribute_value(node, "title"); - if(href && title && begins_with(href, "/view_video.php?viewkey")) { - auto item = BodyItem::create(strip(title)); - item->url = std::string("https://www.pornhub.com") + href; - result_items->push_back(std::move(item)); - } - }, &result_items); - if(result != 0) - goto cleanup; - - result = quickmedia_html_find_nodes_xpath(&html_search, "//div[class='phimage']//img", - [](QuickMediaHtmlNode *node, void *userdata) { - ItemData *item_data = (ItemData*)userdata; - if(item_data->index >= item_data->result_items->size()) - return; - - const char *data_src = quickmedia_html_node_get_attribute_value(node, "data-src"); - if(data_src && contains(data_src, "phncdn.com/videos")) { - (*item_data->result_items)[item_data->index]->thumbnail_url = data_src; - ++item_data->index; - } - }, &item_data); + HtmlElement *html_root = html_parse(website_data.data(), website_data.size()); + html_find_tags_with_class(html_root, "div", "phimage", [&result_items](HtmlElement *html_element) { + auto it = html_element->attributes.find("data-entrycode"); + if(it == html_element->attributes.end() || it->second != "VidPg-premVid-videoPage") { + html_find_tags(html_element, "a", [&result_items](HtmlElement *html_element) { + const std::string &href = html_get_attribute_or(html_element, "href", ""); + const std::string &title = html_get_attribute_or(html_element, "title", ""); + if(!href.empty() && !title.empty() && begins_with(href.c_str(), "/view_video.php?viewkey")) { + std::string title_fixed = strip(title); + html_unescape_sequences(title_fixed); + auto item = BodyItem::create(std::move(title_fixed)); + item->url = std::string("https://www.pornhub.com") + href; + item->thumbnail_size = sf::Vector2i(192, 108); + result_items.push_back(std::move(item)); + + html_find_tags(html_element, "img", [&result_items](HtmlElement *html_element) { + const std::string &src = html_get_attribute_or(html_element, "data-src", ""); + if(src.find("phncdn.com/videos") != std::string::npos) + result_items.back()->thumbnail_url = src; + }); + } + }); + } + }); + html_cleanup(html_root); // Attempt to skip promoted videos (that are not related to the search term) - if(result_items.size() >= 4) { + if(result_items.size() >= 4) result_items.erase(result_items.begin(), result_items.begin() + 4); - } - cleanup: - quickmedia_html_search_deinit(&html_search); - return result == 0 ? SearchResult::OK : SearchResult::ERR; + return SearchResult::OK; + } + + SearchResult PornhubSearchPage::search(const std::string &str, BodyItems &result_items) { + std::string url = "https://www.pornhub.com/video/search?search="; + url += url_param_encode(str); + return get_videos_in_page(url, is_tor_enabled(), result_items); } PluginResult PornhubSearchPage::submit(const std::string &title, const std::string &url, std::vector &result_tabs) { @@ -78,56 +140,7 @@ namespace QuickMedia { BodyItems PornhubVideoPage::get_related_media(const std::string &url) { BodyItems result_items; - - std::string website_data; - if(download_to_string(url, website_data, {}, is_tor_enabled()) != DownloadResult::OK) - return result_items; - - struct ItemData { - BodyItems *result_items; - size_t index; - }; - ItemData item_data = { &result_items, 0 }; - - QuickMediaHtmlSearch html_search; - int result = quickmedia_html_search_init(&html_search, website_data.c_str()); - if(result != 0) - goto cleanup; - - result = quickmedia_html_find_nodes_xpath(&html_search, "//div[class='phimage']//a", - [](QuickMediaHtmlNode *node, void *userdata) { - auto *result_items = (BodyItems*)userdata; - const char *href = quickmedia_html_node_get_attribute_value(node, "href"); - const char *title = quickmedia_html_node_get_attribute_value(node, "title"); - if(href && title && begins_with(href, "/view_video.php?viewkey")) { - auto item = BodyItem::create(strip(title)); - item->url = std::string("https://www.pornhub.com") + href; - result_items->push_back(std::move(item)); - } - }, &result_items); - if(result != 0) - goto cleanup; - - result = quickmedia_html_find_nodes_xpath(&html_search, "//div[class='phimage']//img", - [](QuickMediaHtmlNode *node, void *userdata) { - ItemData *item_data = (ItemData*)userdata; - if(item_data->index >= item_data->result_items->size()) - return; - - const char *src = quickmedia_html_node_get_attribute_value(node, "src"); - if(src && contains(src, "phncdn.com/videos")) { - (*item_data->result_items)[item_data->index]->thumbnail_url = src; - ++item_data->index; - } - }, &item_data); - - // Attempt to skip promoted videos (that are not related to the search term) - if(result_items.size() >= 4) { - result_items.erase(result_items.begin(), result_items.begin() + 4); - } - - cleanup: - quickmedia_html_search_deinit(&html_search); + get_videos_in_page(url, is_tor_enabled(), result_items); return result_items; } } \ No newline at end of file -- cgit v1.2.3