Initial commit, copied from wwwhtml-parser. Works. Need to add unescape of html sequences

author: DEC05EBA <dec05eba@protonmail.com> 2019-12-31 08:46:05 +0100
committer: DEC05EBA <dec05eba@protonmail.com> 2019-12-31 08:49:12 +0100
commit: ac0a3e0ebb9b460a31a76115cb4d494361c03e49 (patch)
tree: 50cbb9a689934792372eb998d3f2653776194829
7 files changed, 519 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..636c6b9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+# Compiled sibs files
+sibs-build/
+compile_commands.json
+tests/sibs-build/
+tests/compile_commands.json
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b27915c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,14 @@
+Copyright (C) 2019  DEC05EBA
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <https://www.gnu.org/licenses/>.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b90e48b
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+A small html parser written in C. The parser repairs broken html (missing end tags). It doesn't perform any dynamic (heap) allocations and reports the parsing result through a callback function instead of building a DOM tree.
diff --git a/include/HtmlParser.h b/include/HtmlParser.h
new file mode 100644
index 0000000..72de123
--- /dev/null
+++ b/include/HtmlParser.h
@@ -0,0 +1,54 @@
+#ifndef HTML_PARSER_H
+#define HTML_PARSER_H
+
+/*
+    A small html parser that has no dependencies, doesn't dynamically allocate any memory
+    and can parse and repair broken html (just like web browsers)
+*/
+
+#include <stddef.h>
+
+/*
+    A view of |size| bytes starting at |data|.
+    The parser fills views with ranges of the html source buffer, so the bytes
+    are not guaranteed to be followed by a NUL terminator; always use |size|.
+*/
+typedef struct {
+    const char *data;
+    size_t size;
+} StringView;
+
+typedef struct HtmlParser HtmlParser;
+
+/* The kind of event that is reported to the parse callback */
+typedef enum{
+    /* A start tag name was read. HtmlParser.tag_name is the name of the tag */
+    HTML_PARSE_TAG_START,
+    /* A tag was closed. HtmlParser.tag_name is the name of the tag that was closed */
+    HTML_PARSE_TAG_END,
+    /* An attribute was read. HtmlParser.attribute_key is the name, HtmlParser.attribute_value is the value (size 0 if there is no value) */
+    HTML_PARSE_ATTRIBUTE,
+    /* Text between tags. HtmlParser.text is the text with surrounding whitespace removed */
+    HTML_PARSE_TEXT,
+    /* The content of a script tag. HtmlParser.text is the content with surrounding whitespace removed */
+    HTML_PARSE_JAVASCRIPT_CODE
+} HtmlParseType;
+
+/*
+    Called by html_parser_parse for every parse event.
+    |html_parser| is the parser that produced the event; the data for the event is read from its fields.
+    |userdata| is the pointer that was passed to html_parser_init.
+*/
+typedef void (*HtmlParseCallback)(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata);
+
+/* The maximum number of unclosed (still open) tags the parser keeps track of */
+#define UNCLOSED_TAGS_SIZE 2048
+
+/* Parser state. Set up with html_parser_init and run with html_parser_parse */
+struct HtmlParser {
+    /* The html source and its length in bytes, as passed to html_parser_init */
+    const char *source;
+    size_t source_len;
+    /* The current read position in |source| */
+    size_t offset;
+    /* The function that receives parse events, and the pointer that is forwarded to it */
+    HtmlParseCallback parse_callback;
+    void *callback_userdata;
+
+    /* Data for the event that is being reported. These are views into |source| */
+    StringView tag_name;
+    StringView attribute_key;
+    StringView attribute_value;
+    StringView text;
+
+    /* Non-zero while parsing a start tag that is a void tag */
+    int is_tag_void;
+    /* Non-zero while parsing a script start tag */
+    int inside_script_tag;
+
+    /* A stack of the tags that have been opened but not closed yet. |unclosed_tags_offset| is the number of items in the stack */
+    size_t unclosed_tags_offset;
+    StringView unclosed_tags[UNCLOSED_TAGS_SIZE];
+};
+
+/*
+    Stores the html source (|len| bytes at |html_source|), the callback and the userdata in |self|.
+    The source is not copied, it has to stay valid while parsing.
+*/
+void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata);
+/* Currently does nothing */
+void html_parser_deinit(HtmlParser *self);
+
+/*
+    Parses the whole source from the start and reports the result to the parse callback.
+    Tags that are still open at the end of the source are reported as closed.
+*/
+void html_parser_parse(HtmlParser *self);
+
+#endif /* HTML_PARSER_H */
+\ No newline at end of file
diff --git a/project.conf b/project.conf
new file mode 100644
index 0000000..84f91f0
--- /dev/null
+++ b/project.conf
@@ -0,0 +1,5 @@
+[package]
+name = "html-parser"
+type = "static"
+version = "0.1.0"
+platforms = ["any"]
diff --git a/src/HtmlParser.c b/src/HtmlParser.c
new file mode 100644
index 0000000..a1b62cd
--- /dev/null
+++ b/src/HtmlParser.c
@@ -0,0 +1,434 @@
+#include "../include/HtmlParser.h"
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+/*
+    The tag names that is_void_tag treats as void tags.
+    The list is terminated by an item that has NULL as data.
+*/
+static StringView void_tags[] = {
+    {.data = "area", .size = 4},
+    {.data = "base", .size = 4},
+    {.data = "br", .size = 2},
+    {.data = "col", .size = 3},
+    {.data = "command", .size = 7},
+    {.data = "embed", .size = 5},
+    {.data = "hr", .size = 2},
+    {.data = "img", .size = 3},
+    {.data = "input", .size = 5},
+    {.data = "keygen", .size = 6},
+    {.data = "link", .size = 4},
+    {.data = "meta", .size = 4},
+    {.data = "param", .size = 5},
+    {.data = "source", .size = 6},
+    {.data = "track", .size = 5},
+    {.data = "wbr", .size = 3},
+    {.data = NULL, .size = 0}
+};
+
+/* The tag name that start tags are compared against (byte for byte) to detect script tags */
+static StringView script_tag = {.data = "script", .size = 6};
+
+/* Returns 1 if the two views have the same size and the same bytes, otherwise returns 0 */
+static int string_view_equals(StringView *self, StringView *other) {
+    if(self->size != other->size)
+        return 0;
+    return memcmp(self->data, other->data, self->size) == 0;
+}
+
+/* Returns 1 if |c| is a space, newline, carriage return, tab or vertical tab, otherwise returns 0 */
+static int is_whitespace(int c) {
+    return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v';
+}
+
+/* Returns 1 if |c| is a newline or a carriage return, otherwise returns 0 */
+static int is_newline(int c) {
+    switch(c) {
+        case '\n':
+        case '\r':
+            return 1;
+        default:
+            return 0;
+    }
+}
+
+/*
+    Skips the leading characters of |str| (|size| bytes) that |strip_filter_func| returns non-zero for.
+    |output_str| is set to the first character that is kept and |output_size| to the number of bytes that remain.
+*/
+static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
+    const char *start = str;
+    const char *end = str + size;
+    while(start != end && strip_filter_func(*start))
+        ++start;
+    *output_str = start;
+    *output_size = (size_t)(end - start);
+}
+
+/*
+    Removes the trailing characters of |str| (|size| bytes) that |strip_filter_func| returns non-zero for.
+    |output_size| is set to the number of bytes that remain (0 if every character is removed or if |size| is 0).
+    Only size_t is used for the index; ssize_t is a POSIX type that is not declared by any header included in this file.
+*/
+static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
+    while(size > 0 && strip_filter_func(str[size - 1])) {
+        --size;
+    }
+    *output_size = size;
+}
+
+/*
+    Removes both the leading and the trailing characters of |str| (|size| bytes) that |strip_filter_func| returns non-zero for.
+    |output_str| and |output_size| are set to the part that remains.
+*/
+static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
+    const char *stripped_str;
+    size_t stripped_size;
+    lstrip(str, size, &stripped_str, &stripped_size, strip_filter_func);
+    rstrip(stripped_str, stripped_size, &stripped_size, strip_filter_func);
+    *output_str = stripped_str;
+    *output_size = stripped_size;
+}
+
+/*
+    Returns 1 if |tag_name| starts with '!' or if it's one of the names in |void_tags|, otherwise returns 0.
+    The comparison with |void_tags| is byte for byte.
+*/
+static int is_void_tag(StringView *tag_name) {
+    StringView *candidate;
+    /* !DOCTYPE, !--, etc.... */
+    if(tag_name->size > 0 && tag_name->data[0] == '!')
+        return 1;
+    for(candidate = void_tags; candidate->data; ++candidate) {
+        if(string_view_equals(tag_name, candidate))
+            return 1;
+    }
+    return 0;
+}
+
+/*
+    Puts the parse state back to the start of the source: the read position, the event data,
+    the tag flags and the unclosed tags stack. The source and the callback are not touched.
+*/
+static void html_parser_reset(HtmlParser *self) {
+    StringView empty_view;
+    empty_view.data = NULL;
+    empty_view.size = 0;
+
+    self->offset = 0;
+    self->tag_name = empty_view;
+    self->attribute_key = empty_view;
+    self->attribute_value = empty_view;
+    self->text = empty_view;
+    self->is_tag_void = 0;
+    self->inside_script_tag = 0;
+    self->unclosed_tags_offset = 0;
+}
+
+/*
+    Stores the source (|len| bytes at |html_source|), the callback and the userdata in |self|.
+    Only the pointer to the source is stored, the source is not copied.
+    The rest of the parse state is not set here, that is done by html_parser_parse when it resets the parser.
+*/
+void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) {
+    self->source = html_source;
+    self->source_len = len;
+    self->parse_callback = parse_callback;
+    self->callback_userdata = userdata;
+}
+
+/* Does nothing; the body is empty */
+void html_parser_deinit(HtmlParser *self) {
+    
+}
+
+/*
+    Returns the character at the read position and moves the read position one step forward.
+    Returns '\0' without moving if the read position is at the end of the source.
+*/
+static char html_parser_next_char(HtmlParser *self) {
+    if(self->offset >= self->source_len)
+        return '\0';
+    return self->source[self->offset++];
+}
+
+/*
+    Returns the character at the read position without moving the read position.
+    Returns '\0' if the read position is at the end of the source.
+*/
+static char html_parser_peek_char(HtmlParser *self) {
+    return self->offset < self->source_len ? self->source[self->offset] : '\0';
+}
+
+/*
+    Moves the read position one step forward.
+    There is no check against the end of the source here, so callers are expected
+    to peek first and only advance past a character that exists.
+*/
+static void html_parser_advance_char(HtmlParser *self) {
+    ++self->offset;
+}
+
+/* Returns 1 if |c| is an ascii letter (a-z or A-Z), otherwise returns 0 */
+static int is_alpha(char c) {
+    if(c >= 'a' && c <= 'z')
+        return 1;
+    return c >= 'A' && c <= 'Z';
+}
+
+/* Returns 1 if |c| is an ascii digit (0-9), otherwise returns 0 */
+static int is_digit(char c) {
+    return !(c < '0' || c > '9');
+}
+
+/* Returns non-zero if |c| can be part of a tag name or an attribute name: an ascii letter, a digit, '-', '_' or '!' */
+static int is_identifier_char(char c) {
+    switch(c) {
+        case '-':
+        case '_':
+        case '!':
+            return 1;
+        default:
+            return is_alpha(c) || is_digit(c);
+    }
+}
+
+/*
+    Pushes a tag name (|size| bytes at |data|) to the unclosed tags stack.
+    If the stack is full then a message is written to stderr and the tag is not added.
+*/
+static void html_parser_try_append_unclosed_tag(HtmlParser *self, const char *data, size_t size) {
+    StringView *slot;
+    if(self->unclosed_tags_offset == UNCLOSED_TAGS_SIZE) {
+        fprintf(stderr, "Reached the maximum number of unclosed tags! the html source is too broken\n");
+        return;
+    }
+    slot = &self->unclosed_tags[self->unclosed_tags_offset++];
+    slot->data = data;
+    slot->size = size;
+}
+
+/* Removes the top item of the unclosed tags stack. The stack must not be empty (checked with assert) */
+static void html_parser_pop_unclosed_tag(HtmlParser *self) {
+    assert(self->unclosed_tags_offset > 0);
+    --self->unclosed_tags_offset;
+}
+
+/* Removes the top item of the unclosed tags stack, or does nothing if the stack is empty */
+static void html_parser_try_pop_unclosed_tag(HtmlParser *self) {
+    if(self->unclosed_tags_offset == 0)
+        return;
+    --self->unclosed_tags_offset;
+}
+
+/*
+    Copies the top item of the unclosed tags stack to |result| and returns 1.
+    Returns 0 and leaves |result| untouched if the stack is empty.
+*/
+static int html_parser_try_get_top_unclosed_tag(HtmlParser *self, StringView *result) {
+    if(self->unclosed_tags_offset == 0)
+        return 0;
+    *result = self->unclosed_tags[self->unclosed_tags_offset - 1];
+    return 1;
+}
+
+/* Moves the read position forward to the next character that is not whitespace (or to the end of the source) */
+static void html_parser_skip_whitespace(HtmlParser *self) {
+    while(is_whitespace(html_parser_peek_char(self)))
+        html_parser_advance_char(self);
+}
+
+/*
+    Returns 1 if |c| can be part of an attribute value that is not quoted, otherwise returns 0.
+    Whitespace ends an unquoted value; without that, the value of the first attribute in
+    <a href=x class=y> would also include " class=y".
+*/
+static int is_attribute_value_char(char c) {
+    switch(c) {
+        case ' ':
+        case '\t':
+        case '\n':
+        case '\r':
+        case '\f':
+        case '\v':
+        case '"':
+        case '\'':
+        case '`':
+        case '<':
+        case '>':
+        case '&':
+            return 0;
+        default:
+            return 1;
+    }
+}
+
+/*
+    Parses an attribute value that is surrounded by quotes. The read position has to be right after the opening quote.
+    Reads until the closing |quote_symbol| (which is skipped) or until the end of the source.
+    The result is stored in self->attribute_value, with the newlines (\n and \r) at the start and at the end removed.
+*/
+/* TODO: Unescape html characters in attribute value */
+static void html_parser_parse_attribute_value_quoted(HtmlParser *self, char quote_symbol) {
+    self->attribute_value.data = self->source + self->offset;
+    for(;;) {
+        char c = html_parser_peek_char(self);
+        if(c == quote_symbol) {
+            /* The size is calculated before the closing quote is skipped, so the quote is not part of the value */
+            self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
+            html_parser_advance_char(self);
+            break;
+        } else if(c == '\0') {
+            /* No closing quote; the value is everything up to where the scan stopped */
+            self->attribute_value.size = (self->source + self->offset) - self->attribute_value.data;
+            break;
+        } else {
+            html_parser_advance_char(self);
+        }
+    }
+    strip(self->attribute_value.data, self->attribute_value.size, &self->attribute_value.data, &self->attribute_value.size, is_newline);
+}
+
+/*
+    Parses an attribute value that is not surrounded by quotes, starting at the read position.
+    Reads until a character that is_attribute_value_char rejects, or until a '\0' is returned by the peek.
+    The result is stored in self->attribute_value.
+*/
+static void html_parser_parse_attribute_value(HtmlParser *self) {
+    const char *value_start = self->source + self->offset;
+    for(;;) {
+        char c = html_parser_peek_char(self);
+        if(c == '\0' || !is_attribute_value_char(c))
+            break;
+        html_parser_advance_char(self);
+    }
+    self->attribute_value.data = value_start;
+    self->attribute_value.size = (self->source + self->offset) - value_start;
+}
+
+/*
+    Moves the read position past the end of a javascript string. The read position has to be right after the opening quote.
+    Returns after the closing |quote_symbol| has been read, or when a '\0' is read (the end of the source).
+    A quote that comes after an odd number of backslashes in a row is escaped and doesn't end the string.
+*/
+static void html_parser_goto_end_of_js_string(HtmlParser *self, char quote_symbol) {
+    /* Non-zero when the previous characters are an odd number of backslashes in a row */
+    int escape_quote = 0;
+    for(;;) {
+        char c = html_parser_next_char(self);
+        if(!escape_quote && c == quote_symbol) {
+            return;
+        } else if(c == '\\') {
+            /* Two backslashes in a row is an escaped backslash, so every backslash toggles the state */
+            escape_quote = !escape_quote;
+        } else if(c == '\0') {
+            return;
+        } else {
+            /* Any other character (including an escaped quote) ends the escape sequence */
+            escape_quote = 0;
+        }
+    }
+}
+
+/*
+    Called right after the '>' of a script start tag. Finds the script end tag ("</script") and skips
+    javascript strings on the way, so a "</script>" inside a javascript string doesn't end the script.
+    The content of the script (with surrounding whitespace removed) is stored in self->text and is
+    reported as HTML_PARSE_JAVASCRIPT_CODE if it's not empty.
+
+    If the end tag is found then it's skipped (including its '>'), HTML_PARSE_TAG_END is reported for the
+    script tag and the script tag is removed from the unclosed tags stack. Without that the script tag would
+    stay open and the elements after it would be reported as if they were inside the script.
+    If the end of the source is reached instead then the script tag stays in the unclosed tags stack,
+    and html_parser_parse closes it together with the other unclosed tags.
+*/
+static void html_parser_goto_script_end_tag(HtmlParser *self) {
+    int found_end_tag = 0;
+    self->text.data = self->source + self->offset;
+    self->text.size = 0;
+    for(;;) {
+        char c = html_parser_peek_char(self);
+        if(c == '"' || c == '\'') {
+            html_parser_advance_char(self);
+            html_parser_goto_end_of_js_string(self, c);
+        } else if(c == '<' && self->offset + 7 < self->source_len && strncmp(self->source + self->offset + 1, "/script", 7) == 0) {
+            self->text.size = (self->source + self->offset) - self->text.data;
+            strip(self->text.data, self->text.size, &self->text.data, &self->text.size, is_whitespace);
+            /* Skip "</scrip"; the rest of the end tag is skipped by the loop below */
+            self->offset += 7;
+            for(;;) {
+                c = html_parser_peek_char(self);
+                if(c == '>') {
+                    html_parser_advance_char(self);
+                    break;
+                } else if(c == '\0') {
+                    break;
+                } else {
+                    html_parser_advance_char(self);
+                }
+            }
+            found_end_tag = 1;
+            break;
+        } else if(c == '\0') {
+            self->text.size = (self->source + self->offset) - self->text.data;
+            strip(self->text.data, self->text.size, &self->text.data, &self->text.size, is_whitespace);
+            break;
+        } else {
+            html_parser_advance_char(self);
+        }
+    }
+    if(self->text.size > 0)
+        self->parse_callback(self, HTML_PARSE_JAVASCRIPT_CODE, self->callback_userdata);
+
+    if(found_end_tag) {
+        StringView top_unclosed_tag;
+        /* The script tag is missing from the stack if the stack was full when the start tag was parsed */
+        if(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag) && string_view_equals(&top_unclosed_tag, &script_tag)) {
+            self->tag_name = top_unclosed_tag;
+            self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
+            html_parser_pop_unclosed_tag(self);
+        }
+    }
+}
+
+/*
+    Parses a start tag. The read position has to be right after the '<'.
+    The first identifier is the tag name, which is reported as HTML_PARSE_TAG_START. Tags that are not void tags
+    are also added to the unclosed tags stack. Every identifier after the tag name is an attribute name,
+    with an optional value after '=' (quoted or unquoted), and is reported as HTML_PARSE_ATTRIBUTE.
+    The tag ends at '>', at "/>" or at the end of the source:
+    - '>' reports HTML_PARSE_TAG_END if the tag is a void tag. If the tag is a script tag then the content of the script is parsed as well.
+    - "/>" reports HTML_PARSE_TAG_END if a tag name was found, and removes the tag from the unclosed tags stack if it's not a void tag.
+*/
+static void html_parser_parse_tag_start(HtmlParser *self) {
+    int tag_name_found = 0;
+    for(;;) {
+        char c = html_parser_next_char(self);
+        if(c == '>') {
+            if(self->is_tag_void)
+                self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
+            self->is_tag_void = 0;
+
+            if(self->inside_script_tag) {
+                self->inside_script_tag = 0;
+                /* <script> tags require special handling since they can have </script> inside a javascript string */
+                html_parser_goto_script_end_tag(self);
+            }
+            return;
+        } else if(c == '/') {
+            if(html_parser_peek_char(self) == '>') {
+                html_parser_advance_char(self);
+                if(tag_name_found) {
+                    self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
+                    /* Void tags are never added to the unclosed tags stack, so there is nothing to remove for them */
+                    if(!self->is_tag_void)
+                        html_parser_try_pop_unclosed_tag(self);
+                }
+                self->is_tag_void = 0;
+                self->inside_script_tag = 0;
+                return;
+            }
+        } else if(is_identifier_char(c)) {
+            StringView identifier;
+            /* The first character of the identifier has already been read by html_parser_next_char */
+            identifier.data = self->source + self->offset - 1;
+            for(;;) {
+                c = html_parser_peek_char(self);
+                if(is_identifier_char(c)) {
+                    html_parser_advance_char(self);
+                } else {
+                    break;
+                }
+            }
+            identifier.size = (self->source + self->offset) - identifier.data;
+            if(tag_name_found) {
+                /* attribute name */
+                self->attribute_key = identifier;
+                self->attribute_value.data = NULL;
+                self->attribute_value.size = 0;
+
+                html_parser_skip_whitespace(self);
+                c = html_parser_peek_char(self);
+                if(c == '=') {
+                    html_parser_advance_char(self);
+                    html_parser_skip_whitespace(self);
+                    c = html_parser_peek_char(self);
+                    if(c == '"' || c == '\'' || c == '`') {
+                        /* Skip the opening quote, it's not part of the value */
+                        html_parser_advance_char(self);
+                        html_parser_parse_attribute_value_quoted(self, c);
+                    } else if(is_attribute_value_char(c)) {
+                        /*
+                            Don't advance here. |c| was only peeked and it's the first character of the value,
+                            html_parser_parse_attribute_value starts the value at the read position.
+                            This also keeps the read position inside the source if |c| is the '\0' at the end of the source.
+                        */
+                        html_parser_parse_attribute_value(self);
+                    }
+                }
+                self->parse_callback(self, HTML_PARSE_ATTRIBUTE, self->callback_userdata);
+            } else {
+                /* tag name */
+                self->tag_name = identifier;
+                tag_name_found = 1;
+                self->is_tag_void = is_void_tag(&self->tag_name);
+                if(!self->is_tag_void) {
+                    html_parser_try_append_unclosed_tag(self, self->tag_name.data, self->tag_name.size);
+                    self->inside_script_tag = string_view_equals(&self->tag_name, &script_tag);
+                }
+                self->parse_callback(self, HTML_PARSE_TAG_START, self->callback_userdata);
+            }
+        } else if(c == '\0') {
+            return;
+        }
+    }
+}
+
+/*
+    Parses an end tag. The read position has to be right after the "</".
+    The first identifier is the name of the tag to close. The tags in the unclosed tags stack are closed
+    (reported as HTML_PARSE_TAG_END and removed) from the top until a tag with that name has been closed,
+    which also closes the tags that are missing an end tag. If there is no tag with that name in the stack
+    then every unclosed tag is closed.
+    Everything after the name is skipped until the '>' (which is also skipped) or the end of the source.
+*/
+static void html_parser_parse_tag_end(HtmlParser *self) {
+    int tag_name_found = 0;
+    for(;;) {
+        char c = html_parser_peek_char(self);
+        if(c == '>') {
+            html_parser_advance_char(self);
+            return;
+        } else if(!tag_name_found && is_identifier_char(c)) {
+            StringView tag_end_name;
+            tag_end_name.data = self->source + self->offset;
+            html_parser_advance_char(self);
+            for(;;) {
+                c = html_parser_peek_char(self);
+                if(is_identifier_char(c)) {
+                    html_parser_advance_char(self);
+                } else {
+                    break;
+                }
+            }
+            tag_end_name.size = (self->source + self->offset) - tag_end_name.data;
+            /* Only the first identifier is the tag name; anything after it (for example in "</div foo>") must not be treated as another end tag */
+            tag_name_found = 1;
+
+            /* void tags close themselves, this is probably invalid html but we choose to ignore it silently */
+            if(is_void_tag(&tag_end_name)) {
+                /* The precision argument for %.*s has to be an int, passing a size_t is undefined behavior */
+                fprintf(stderr, "Warning: got end tag for void tag '%.*s'\n", (int)tag_end_name.size, tag_end_name.data);
+                continue;
+            }
+
+            StringView top_unclosed_tag;
+            while(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag)) {
+                self->tag_name = top_unclosed_tag;
+                self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
+                html_parser_pop_unclosed_tag(self);
+                if(string_view_equals(&top_unclosed_tag, &tag_end_name))
+                    break;
+            }
+        } else if(c == '\0') {
+            return;
+        } else {
+            html_parser_advance_char(self);
+        }
+    }
+}
+
+/*
+    Parses the whole source from the start and reports the result to the parse callback.
+    The parse state is reset first, so this always starts from the beginning of the source.
+    "</" starts an end tag and any other '<' starts a start tag. Everything else is text, which is
+    reported as HTML_PARSE_TEXT with surrounding whitespace removed (text that is only whitespace is not reported).
+    Parsing stops when a '\0' is read, which is also what the end of the source is read as.
+    The tags that are still unclosed at that point are closed (reported as HTML_PARSE_TAG_END), innermost tag first.
+*/
+void html_parser_parse(HtmlParser *self) {
+    html_parser_reset(self);
+    for(;;) {
+        char c = html_parser_next_char(self);
+        if(c == '<') {
+            if(html_parser_peek_char(self) == '/') {
+                html_parser_advance_char(self);
+                html_parser_parse_tag_end(self);
+            } else {
+                html_parser_parse_tag_start(self);
+            }
+        } else if(c == '\0') {
+            break;
+        } else {
+            /* The first character of the text has already been read by html_parser_next_char */
+            self->text.data = (self->source + self->offset) - 1;
+            for(;;) {
+                c = html_parser_peek_char(self);
+                if(c == '<' || c == '\0')
+                    break;
+                else
+                    html_parser_advance_char(self);
+            }
+            self->text.size = (self->source + self->offset) - self->text.data;
+            strip(self->text.data, self->text.size, &self->text.data, &self->text.size, is_whitespace);
+            if(self->text.size > 0)
+                self->parse_callback(self, HTML_PARSE_TEXT, self->callback_userdata);
+        }
+    }
+
+    /* Close the tags that are missing an end tag */
+    StringView top_unclosed_tag;
+    while(html_parser_try_get_top_unclosed_tag(self, &top_unclosed_tag)) {
+        self->tag_name = top_unclosed_tag;
+        self->parse_callback(self, HTML_PARSE_TAG_END, self->callback_userdata);
+        html_parser_pop_unclosed_tag(self);
+    }
+}
diff --git a/tests/main.c b/tests/main.c
new file mode 100644
index 0000000..ff1570b
--- /dev/null
+++ b/tests/main.c
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+/* Placeholder test program: only prints a greeting and doesn't use the parser yet */
+int main(int argc, char **argv) {
+    printf("hello, world!\n");
+    return 0;
+}
author	DEC05EBA <dec05eba@protonmail.com>	2019-12-31 08:46:05 +0100
committer	DEC05EBA <dec05eba@protonmail.com>	2019-12-31 08:49:12 +0100
commit	ac0a3e0ebb9b460a31a76115cb4d494361c03e49 (patch)
tree	50cbb9a689934792372eb998d3f2653776194829