From a1ca82847eb356c6b85ada2ac11f38d98f6e085e Mon Sep 17 00:00:00 2001 From: dec05eba Date: Mon, 13 Jul 2020 15:57:10 +0200 Subject: Start on add_rss, add rss parser --- rss.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 rss.c (limited to 'rss.c') diff --git a/rss.c b/rss.c new file mode 100644 index 0000000..fdb932c --- /dev/null +++ b/rss.c @@ -0,0 +1,176 @@ +#include "rss.h" +#include "download.h" +#include "buffer.h" +#include +#include +#include + +static int is_alpha_lowercase(char c) { + return c >= 'a' && c <= 'z'; +} + +static int is_digit(char c) { + return c >= '0' && c <= '9'; +} + +static char* get_amp_end(char *str) { + for(;;) { + char c = *str; + if(is_alpha_lowercase(c) || is_digit(c) || c == '#') + ++str; + else if(c == ';' || c == '\0') + break; + } + return str; +} + +static void xml_unescape(char *str, char *result, int result_length) { + int index = 0; + for(;;) { + char c = *str; + if(c == '&') { + char *amp_end = get_amp_end(str + 1); + char prev_char = *amp_end; + *amp_end = '\0'; + + if(str[1] == '#') { + result[index++] = atoi(str + 2); + } else { + if(strcmp(str + 1, "amp") == 0) + result[index++] = '&'; + else if(strcmp(str + 1, "lt") == 0) + result[index++] = '<'; + else if(strcmp(str + 1, "gt") == 0) + result[index++] = '>'; + else if(strcmp(str + 1, "apos") == 0) + result[index++] = '\''; + } + + *amp_end = prev_char; + str = amp_end; + if(prev_char != '\0') + ++str; + } else if(c == '\0') { + result[index] = '\0'; + break; + } else { + result[index++] = c; + ++str; + } + + if(index == result_length - 1) { + result[index] = '\0'; + break; + } + } +} + +static char* string_substr_before_tag_end(char *str, const char *tag) { + char *tag_p = strstr(str, tag); + if(tag_p) + *tag_p = '\0'; + return tag_p; +} + +typedef void (*RssParseCallback)(const char *title, const char *link, void *userdata); + +static int parse_rss(char *str, char *rss_title_str, int rss_title_str_size, RssParseCallback parse_callback, void *userdata) { + char *channel_start = strstr(str, ""); + if(!channel_start) + return 1; + + char *after_channel = channel_start + 9; + + char *rss_title = strstr(after_channel, ""); + char *first_item = strstr(after_channel, "<item>"); + if(!first_item) { + rss_title += 7; + string_substr_before_tag_end(rss_title, ""); + xml_unescape(rss_title, rss_title_str, rss_title_str_size); + return 0; + } + + if(rss_title < first_item) { + rss_title += 7; + string_substr_before_tag_end(rss_title, ""); + xml_unescape(rss_title, rss_title_str, rss_title_str_size); + } else { + rss_title_str[0] = '\0'; + } + + char title_str[256]; + char link_str[2084]; + + char *item = first_item; + for(;;) { + char *after_first_item = item + 6; + char *item_end = strstr(after_first_item, ""); + if(!item_end) + return 1; + + char *item_title = strstr(after_first_item, ""); + if(!item_title) + return 1; + + if(item_title >= item_end) + return 1; + + item_title += 7; + char *after_title = string_substr_before_tag_end(item_title, ""); + if(!after_title) + return 1; + + after_title += 8; + char *item_link = strstr(after_title, ""); + if(!item_link) + return 1; + + if(item_link >= item_end) + return 1; + + item_link += 6; + string_substr_before_tag_end(item_link, ""); + + xml_unescape(item_title, title_str, sizeof(title_str)); + xml_unescape(item_link, link_str, sizeof(link_str)); + parse_callback(title_str, link_str, userdata); + + item = strstr(item_end + 7, ""); + if(!item) + return 0; + } +} + +static void rss_parse_callback(const char *title, const char *link, void *userdata) { + (void)userdata; + fprintf(stderr, "title: |%s|, link: |%s|\n", title, link); +} + +int add_rss(const char *name, const char *url, const char *rss_config_dir, const char *start_after) { + (void)name; + (void)rss_config_dir; + (void)start_after; + int result = 0; + + Buffer buffer; + buffer_init(&buffer); + int res = download_to_buffer(url, &buffer); + if(res != 0) { + fprintf(stderr, "Failed to download rss: %s\n", url); + result = res; + goto cleanup; + } + + char rss_title[256]; + res = parse_rss(buffer.data, rss_title, sizeof(rss_title), rss_parse_callback, NULL); + if(res != 0) { + fprintf(stderr, "Failed to parse rss for url: %s\n", url); + result = res; + goto cleanup; + } + fprintf(stderr, "rss title: |%s|\n", rss_title); + + cleanup: + buffer_deinit(&buffer); + return result; +} -- cgit v1.2.3