From 534c441fd8172322ff5eaad54a1d26b9d8492c39 Mon Sep 17 00:00:00 2001
From: dec05eba
Date: Wed, 15 Sep 2021 18:22:45 +0200
Subject: Initial commit, finished

---
 .gitignore          |   5 ++
 .gitmodules         |   3 +
 LICENSE             |  19 ++++++
 README.md           |   5 ++
 depends/html-parser |   1 +
 include/HtmlTree.h  |  66 +++++++++++++++++++
 project.conf        |  12 ++++
 src/HtmlTree.c      | 179 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/main.c        | 111 ++++++++++++++++++++++++
 9 files changed, 401 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .gitmodules
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 160000 depends/html-parser
 create mode 100644 include/HtmlTree.h
 create mode 100644 project.conf
 create mode 100644 src/HtmlTree.c
 create mode 100644 tests/main.c

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..636c6b9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+# Compiled sibs files
+sibs-build/
+compile_commands.json
+tests/sibs-build/
+tests/compile_commands.json
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..5475f6a
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "depends/html-parser"]
+	path = depends/html-parser
+	url = git://git.dec05eba.com/html-parser
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b79dbf4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2021 dec05eba
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..eda1553
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+A small HTML parser written in ANSI C (C89). The parser fixes broken HTML (missing end tags).
+
+This HTML parser can also be used to parse XML files with namespaces, such as RSS feeds.
+# Note
+This library does not decode HTML sequences in text and attribute values.
diff --git a/depends/html-parser b/depends/html-parser
new file mode 160000
index 0000000..fe3993c
--- /dev/null
+++ b/depends/html-parser
@@ -0,0 +1 @@
+Subproject commit fe3993c221a604f5fb9f7ef1ba6179740cbf9173
diff --git a/include/HtmlTree.h b/include/HtmlTree.h
new file mode 100644
index 0000000..6bb3c5f
--- /dev/null
+++ b/include/HtmlTree.h
@@ -0,0 +1,66 @@
+#ifndef HTML_TREE_H
+#define HTML_TREE_H
+
+/*
+    NOTE(review): the two header names below were stripped when this patch was rendered.
+    HtmlParser.h (assumed name of the html-parser dependency header that declares HtmlStringView)
+    and stddef.h (size_t) are restored from usage - confirm against the repository.
+*/
+#include <HtmlParser.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct HtmlNode HtmlNode;
+typedef struct HtmlNodeChild HtmlNodeChild;
+typedef struct HtmlAttribute HtmlAttribute;
+
+typedef enum {
+    HTML_NODE_NODE,
+    HTML_NODE_TEXT,
+    HTML_NODE_JS
+} HtmlNodeType;
+
+/* text and javascript code are also html nodes */
+struct HtmlNode {
+    HtmlStringView name_or_value; /* name of the node if |node_type| is HTML_NODE_NODE, otherwise the text/code value */
+    HtmlNode *parent;
+    HtmlNodeChild *first_child;
+    HtmlNodeChild *last_child;
+    HtmlAttribute *first_attr;
+    HtmlNodeType node_type;
+};
+
+struct HtmlNodeChild {
+    HtmlNode *node;
+    HtmlNodeChild *next;
+};
+
+struct HtmlAttribute {
+    HtmlStringView key;
+    HtmlStringView value;
+    HtmlAttribute *next;
+};
+
+typedef struct {
+    HtmlNode root_node; /* This is not the <html> node. The root_node is always a node with no name and no attributes and only has children */
+} HtmlTree;
+
+/*
+    Returns 0 on success.
+    Input text is expected to be in utf8 and may or may not have UTF8-BOM.
+    |html_source| has to be valid until |html_tree_deinit| is called to free the html node |self|.
+*/
+int html_parse_to_tree(HtmlTree *self, const char *html_source, size_t len);
+void html_tree_deinit(HtmlTree *self);
+
+/* Case insensitive match. Returns NULL if not found */
+HtmlAttribute* html_node_get_attribute_by_name(HtmlNode *self, HtmlStringView name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* HTML_TREE_H */
diff --git a/project.conf b/project.conf
new file mode 100644
index 0000000..a400964
--- /dev/null
+++ b/project.conf
@@ -0,0 +1,12 @@
+[package]
+name = "html-tree"
+type = "static"
+version = "0.1.0"
+platforms = ["any"]
+
+[lang.c]
+version = "c89"
+
+[config]
+expose_include_dirs = ["include"]
+error_on_warning = "true"
diff --git a/src/HtmlTree.c b/src/HtmlTree.c
new file mode 100644
index 0000000..f4deb88
--- /dev/null
+++ b/src/HtmlTree.c
@@ -0,0 +1,179 @@
+#include "../include/HtmlTree.h"
+#include <stdlib.h>
+
+static void html_node_deinit(HtmlNode *self);
+
+/* Resets |self| to an unnamed HTML_NODE_NODE without parent, children or attributes */
+static void html_node_init(HtmlNode *self) {
+    self->name_or_value.data = NULL;
+    self->name_or_value.size = 0;
+    self->parent = NULL;
+    self->first_child = NULL;
+    self->last_child = NULL;
+    self->first_attr = NULL;
+    self->node_type = HTML_NODE_NODE;
+}
+
+/*
+    Allocates a node and, if |parent| is not NULL, appends it to the end of the child list of |parent|.
+    Returns NULL if an allocation fails. |parent| is not modified in that case.
+*/
+static HtmlNode* html_node_create(HtmlStringView name_or_value, HtmlNode *parent, HtmlNodeType node_type) {
+    HtmlNode *new_node = malloc(sizeof(*new_node));
+    if(!new_node)
+        return NULL;
+
+    new_node->name_or_value = name_or_value;
+    new_node->parent = parent;
+    new_node->first_child = NULL;
+    new_node->last_child = NULL;
+    new_node->first_attr = NULL;
+    new_node->node_type = node_type;
+
+    if(parent) {
+        HtmlNodeChild *node_child = malloc(sizeof(*node_child));
+        if(!node_child) {
+            free(new_node);
+            return NULL;
+        }
+
+        node_child->node = new_node;
+        node_child->next = NULL;
+
+        if(!parent->first_child) {
+            parent->first_child = node_child;
+            parent->last_child = node_child;
+        } else {
+            parent->last_child->next = node_child;
+            parent->last_child = node_child;
+        }
+    }
+
+    return new_node;
+}
+
+/*
+    Frees every link of the child list starting at |child| together with the node each link owns.
+    The siblings are walked in a loop so the recursion depth depends only on how deeply
+    nodes are nested, not on how many siblings a node has.
+*/
+static void html_node_child_list_free(HtmlNodeChild *child) {
+    while(child) {
+        HtmlNodeChild *next = child->next;
+        html_node_deinit(child->node);
+        free(child->node);
+        free(child);
+        child = next;
+    }
+}
+
+/* Frees every attribute of the list starting at |attr|. A loop is used so long lists can't exhaust the stack */
+static void html_attribute_list_free(HtmlAttribute *attr) {
+    while(attr) {
+        HtmlAttribute *next = attr->next;
+        free(attr);
+        attr = next;
+    }
+}
+
+/* Releases the children and attributes of |self| and resets its fields. |self| itself is not freed */
+static void html_node_deinit(HtmlNode *self) {
+    self->name_or_value.data = NULL;
+    self->name_or_value.size = 0;
+    self->parent = NULL;
+
+    html_node_child_list_free(self->first_child);
+    self->first_child = NULL;
+    self->last_child = NULL;
+
+    html_attribute_list_free(self->first_attr);
+    self->first_attr = NULL;
+}
+
+/* State shared between |html_parse_to_tree| and |parse_callback| */
+typedef struct {
+    HtmlNode *current_node; /* The node that new children and attributes are added to */
+    HtmlAttribute *current_node_last_attribute; /* Last attribute added to |current_node|, or NULL if none has been added yet */
+} ParseUserdata;
+
+/* Builds the tree from parser events. Returns 0 on success and 1 if an allocation fails */
+static int parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) {
+    ParseUserdata *parse_userdata = userdata;
+
+    switch(parse_type) {
+        case HTML_PARSE_TAG_START: {
+            HtmlNode *new_node = html_node_create(html_parser->tag_name, parse_userdata->current_node, HTML_NODE_NODE);
+            if(!new_node)
+                return 1;
+
+            parse_userdata->current_node = new_node;
+            parse_userdata->current_node_last_attribute = NULL;
+            break;
+        }
+        case HTML_PARSE_TAG_END: {
+            HtmlNode *parent_node = parse_userdata->current_node->parent;
+            /* The root node has no parent, the current node stays on it if an end tag arrives there */
+            if(parent_node) {
+                parse_userdata->current_node = parent_node;
+                parse_userdata->current_node_last_attribute = NULL;
+            }
+            break;
+        }
+        case HTML_PARSE_ATTRIBUTE: {
+            HtmlAttribute *new_attr = malloc(sizeof(*new_attr));
+            if(!new_attr)
+                return 1;
+
+            new_attr->key = html_parser->attribute_key;
+            new_attr->value = html_parser->attribute_value;
+            new_attr->next = NULL;
+
+            if(parse_userdata->current_node_last_attribute)
+                parse_userdata->current_node_last_attribute->next = new_attr;
+            else
+                parse_userdata->current_node->first_attr = new_attr;
+
+            parse_userdata->current_node_last_attribute = new_attr;
+            break;
+        }
+        case HTML_PARSE_TEXT:
+        case HTML_PARSE_JAVASCRIPT_CODE: {
+            HtmlNode *new_node = html_node_create(html_parser->text_stripped, parse_userdata->current_node, parse_type == HTML_PARSE_TEXT ? HTML_NODE_TEXT : HTML_NODE_JS);
+            if(!new_node)
+                return 1;
+
+            parse_userdata->current_node_last_attribute = NULL;
+            break;
+        }
+    }
+
+    return 0;
+}
+
+/* See HtmlTree.h. If parsing fails then everything built so far is freed before returning */
+int html_parse_to_tree(HtmlTree *self, const char *html_source, size_t len) {
+    int result;
+    ParseUserdata parse_userdata;
+    html_node_init(&self->root_node);
+
+    parse_userdata.current_node = &self->root_node;
+    parse_userdata.current_node_last_attribute = NULL;
+    result = html_parser_parse(html_source, len, parse_callback, &parse_userdata);
+
+    if(result != 0)
+        html_tree_deinit(self);
+    return result;
+}
+
+void html_tree_deinit(HtmlTree *self) {
+    html_node_deinit(&self->root_node);
+}
+
+HtmlAttribute* html_node_get_attribute_by_name(HtmlNode *self, HtmlStringView name) {
+    HtmlAttribute *attr = self->first_attr;
+    for(; attr; attr = attr->next) {
+        if(html_string_view_equals_case_insensitive(&attr->key, &name))
+            return attr;
+    }
+    return NULL;
+}
diff --git a/tests/main.c b/tests/main.c
new file mode 100644
index 0000000..3b22f8a
--- /dev/null
+++ b/tests/main.c
@@ -0,0 +1,111 @@
+#include "../include/HtmlTree.h"
+/*
+    NOTE(review): the three header names below were stripped when this patch was rendered.
+    stdio.h and stdlib.h are needed by the code in this file, string.h is a guess - confirm against the repository.
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+    Reads the whole file at |path| into a malloc'ed buffer that the caller has to free.
+    The buffer is not null terminated, its size in bytes is stored in |filesize|.
+    Returns NULL if the file can't be opened, measured or read, or if the allocation fails.
+*/
+char* file_get_content(const char *path, long *filesize) {
+    char *data = NULL;
+    long size;
+    FILE *file = fopen(path, "rb");
+    if(!file) {
+        perror(path);
+        return NULL;
+    }
+
+    if(fseek(file, 0, SEEK_END) != 0)
+        goto fail;
+
+    size = ftell(file);
+    if(size < 0 || fseek(file, 0, SEEK_SET) != 0)
+        goto fail;
+
+    /* malloc(0) is allowed to return NULL, which would make an empty file look like an error */
+    data = malloc(size > 0 ? (size_t)size : 1);
+    if(!data)
+        goto fail;
+
+    if(fread(data, 1, (size_t)size, file) != (size_t)size)
+        goto fail;
+
+    fclose(file);
+    *filesize = size;
+    return data;
+
+fail:
+    free(data);
+    fclose(file);
+    return NULL;
+}
+
+static void html_attributes_print(HtmlAttribute *attr) {
+    while(attr) {
+        printf("%.*s=\"%.*s\" ", (int)attr->key.size, attr->key.data, (int)attr->value.size, attr->value.data);
+        attr = attr->next;
+    }
+}
+
+static void html_node_print(HtmlNode *node);
+static void html_node_child_print(HtmlNodeChild *node_child) {
+    while(node_child) {
+        html_node_print(node_child->node);
+        node_child = node_child->next;
+    }
+}
+
+/* Prints |node| followed by all of its children. Text and javascript nodes are printed as is */
+static void html_node_print(HtmlNode *node) {
+    switch(node->node_type) {
+        case HTML_NODE_NODE: {
+            printf("<%.*s ", (int)node->name_or_value.size, node->name_or_value.data);
+            html_attributes_print(node->first_attr);
+            printf(">\n");
+            html_node_child_print(node->first_child);
+            /* NOTE(review): this format string was rendered as "\n"; restored to a closing tag to match its two arguments - confirm */
+            printf("</%.*s>\n", (int)node->name_or_value.size, node->name_or_value.data);
+            break;
+        }
+        case HTML_NODE_TEXT: {
+            printf("%.*s", (int)node->name_or_value.size, node->name_or_value.data);
+            break;
+        }
+        case HTML_NODE_JS: {
+            printf("%.*s", (int)node->name_or_value.size, node->name_or_value.data);
+            break;
+        }
+    }
+}
+
+int main(void) {
+    int result;
+    HtmlTree html_tree;
+    long filesize;
+    char *file_data;
+
+    file_data = file_get_content("depends/html-parser/tests/hotexamples.html", &filesize);
+    if(!file_data) {
+        fprintf(stderr, "Failed to read from file: depends/html-parser/tests/hotexamples.html\n");
+        return 1;
+    }
+
+    result = html_parse_to_tree(&html_tree, file_data, (size_t)filesize);
+    if(result != 0) {
+        /* The tree is already freed by html_parse_to_tree when it fails, only the file buffer is left */
+        free(file_data);
+        return result;
+    }
+
+    html_node_print(&html_tree.root_node);
+
+    html_tree_deinit(&html_tree);
+    free(file_data);
+    return 0;
+}
-- 
cgit v1.2.3