aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordec05eba <dec05eba@protonmail.com>2021-09-15 18:22:45 +0200
committerdec05eba <dec05eba@protonmail.com>2021-09-15 18:22:45 +0200
commit534c441fd8172322ff5eaad54a1d26b9d8492c39 (patch)
tree78a66cddca0e26d958a7dd7514acf122834c53cf
Initial commit, finished
-rw-r--r--.gitignore5
-rw-r--r--.gitmodules3
-rw-r--r--LICENSE19
-rw-r--r--README.md5
m---------depends/html-parser0
-rw-r--r--include/HtmlTree.h61
-rw-r--r--project.conf12
-rw-r--r--src/HtmlTree.c171
-rw-r--r--tests/main.c80
9 files changed, 356 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..636c6b9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+# Compiled sibs files
+sibs-build/
+compile_commands.json
+tests/sibs-build/
+tests/compile_commands.json
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..5475f6a
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "depends/html-parser"]
+ path = depends/html-parser
+ url = git://git.dec05eba.com/html-parser
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b79dbf4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2021 dec05eba
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..eda1553
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+A small HTML parser written in ANSI C (C89). The parser fixes broken HTML (missing end tags).
+
+This HTML parser can also be used to parse XML files with namespaces, such as RSS feeds.
+# Note
+This library does not decode HTML escape sequences (character references such as `&amp;`) in text and attribute values.
diff --git a/depends/html-parser b/depends/html-parser
new file mode 160000
+Subproject fe3993c221a604f5fb9f7ef1ba6179740cbf917
diff --git a/include/HtmlTree.h b/include/HtmlTree.h
new file mode 100644
index 0000000..6bb3c5f
--- /dev/null
+++ b/include/HtmlTree.h
@@ -0,0 +1,61 @@
+#ifndef HTML_TREE_H
+#define HTML_TREE_H
+
+#include <stddef.h>
+#include <HtmlParser.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct HtmlNode HtmlNode;
+typedef struct HtmlNodeChild HtmlNodeChild;
+typedef struct HtmlAttribute HtmlAttribute;
+
+/* Tells what kind of content an HtmlNode holds in |name_or_value| */
+typedef enum {
+ HTML_NODE_NODE, /* an element (tag); also the type given to the unnamed root node */
+ HTML_NODE_TEXT, /* text reported by the parser */
+ HTML_NODE_JS /* javascript code reported by the parser */
+} HtmlNodeType;
+
+/*
+ One node of the parsed tree. Text and javascript code are also html nodes.
+ Children and attributes are kept as singly linked lists.
+*/
+struct HtmlNode {
+ HtmlStringView name_or_value; /* name of the node if |node_type| is HTML_NODE_NODE, otherwise the text/code value */
+ HtmlNode *parent; /* NULL for the root node */
+ HtmlNodeChild *first_child; /* head of the child list, NULL if there are no children */
+ HtmlNodeChild *last_child; /* tail of the child list, kept so that new children can be appended directly */
+ HtmlAttribute *first_attr; /* head of the attribute list, NULL if there are no attributes */
+ HtmlNodeType node_type;
+};
+
+/* One entry in a node's child list */
+struct HtmlNodeChild {
+ HtmlNode *node; /* the child node, heap allocated and released together with the tree */
+ HtmlNodeChild *next; /* entry of the next sibling, NULL for the last child */
+};
+
+/* One key/value attribute of a node, in the order the parser reported them */
+struct HtmlAttribute {
+ HtmlStringView key; /* copied as-is from the parser's attribute key */
+ HtmlStringView value; /* copied as-is from the parser's attribute value */
+ HtmlAttribute *next; /* next attribute of the same node, NULL for the last one */
+};
+
+/*
+ A parsed document. The root node is stored inline; every other node is heap allocated
+ while parsing and released by |html_tree_deinit|.
+*/
+typedef struct {
+ HtmlNode root_node; /* This is not the <html> node. The root_node is always a node with no name and no attributes and only has children */
+} HtmlTree;
+
+/*
+ Parses |html_source| (|len| bytes) and builds the node tree in |self|.
+ Returns 0 on success, otherwise the non-zero result of the underlying parser.
+ On failure the partially built tree has already been released, |html_tree_deinit| does not need to be called.
+ Input text is expected to be in utf8 and may or may not have UTF8-BOM.
+ |html_source| has to be valid until |html_tree_deinit| is called to free the html node |self|,
+ since node names, text and attributes are views taken from the parser and are not copied by this library.
+*/
+int html_parse_to_tree(HtmlTree *self, const char *html_source, size_t len);
+/* Releases every node and attribute that was allocated for |self|. The root node is left empty */
+void html_tree_deinit(HtmlTree *self);
+
+/*
+ Case insensitive match on the attribute key.
+ Returns the first matching attribute of |self|, or NULL if not found.
+*/
+HtmlAttribute* html_node_get_attribute_by_name(HtmlNode *self, HtmlStringView name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* HTML_TREE_H */
diff --git a/project.conf b/project.conf
new file mode 100644
index 0000000..a400964
--- /dev/null
+++ b/project.conf
@@ -0,0 +1,12 @@
+[package]
+name = "html-tree"
+type = "static"
+version = "0.1.0"
+platforms = ["any"]
+
+[lang.c]
+version = "c89"
+
+[config]
+expose_include_dirs = ["include"]
+error_on_warning = "true"
diff --git a/src/HtmlTree.c b/src/HtmlTree.c
new file mode 100644
index 0000000..f4deb88
--- /dev/null
+++ b/src/HtmlTree.c
@@ -0,0 +1,171 @@
+#include "../include/HtmlTree.h"
+#include <stdlib.h>
+
+static void html_node_deinit(HtmlNode *self);
+
+/* Puts |self| in the empty state: an unnamed element with no parent, no children and no attributes */
+static void html_node_init(HtmlNode *self) {
+    self->node_type = HTML_NODE_NODE;
+    self->parent = NULL;
+    self->first_attr = NULL;
+    self->first_child = NULL;
+    self->last_child = NULL;
+    self->name_or_value.size = 0;
+    self->name_or_value.data = NULL;
+}
+
+/*
+ Allocates a new node and, when |parent| is not NULL, appends it to the end of the parent's child list.
+ Returns the new node, or NULL if an allocation failed (in which case nothing stays allocated
+ and |parent| is left unchanged).
+*/
+static HtmlNode* html_node_create(HtmlStringView name_or_value, HtmlNode *parent, HtmlNodeType node_type) {
+    HtmlNodeChild *link;
+    HtmlNode *node = malloc(sizeof(*node));
+    if(node == NULL)
+        return NULL;
+
+    node->node_type = node_type;
+    node->name_or_value = name_or_value;
+    node->parent = parent;
+    node->first_attr = NULL;
+    node->first_child = NULL;
+    node->last_child = NULL;
+
+    /* A node without a parent has no child list to be linked into */
+    if(parent == NULL)
+        return node;
+
+    link = malloc(sizeof(*link));
+    if(link == NULL) {
+        free(node);
+        return NULL;
+    }
+
+    link->next = NULL;
+    link->node = node;
+
+    if(parent->first_child)
+        parent->last_child->next = link;
+    else
+        parent->first_child = link;
+    parent->last_child = link;
+
+    return node;
+}
+
+/*
+ Releases everything reachable from the child list entry |self|: its node (including that node's subtree)
+ and every following sibling entry together with their nodes.
+ |self| itself is not freed, that is left to the caller.
+ The siblings are walked in a loop instead of recursively, so a node with a very large number
+ of children does not use one stack frame per child. Going down into a subtree still recurses
+ through html_node_deinit.
+*/
+static void html_node_child_deinit(HtmlNodeChild *self) {
+    HtmlNodeChild *sibling = self->next;
+
+    html_node_deinit(self->node);
+    free(self->node);
+    self->node = NULL;
+    self->next = NULL;
+
+    while(sibling) {
+        /* Read the link before the entry that holds it is freed */
+        HtmlNodeChild *following = sibling->next;
+        html_node_deinit(sibling->node);
+        free(sibling->node);
+        free(sibling);
+        sibling = following;
+    }
+}
+
+/*
+ Frees every attribute that follows |self| in the list and clears |self->next|.
+ |self| itself is not freed, that is left to the caller.
+ The list is walked in a loop instead of recursively, so the stack use does not grow
+ with the number of attributes.
+*/
+static void html_attribute_deinit(HtmlAttribute *self) {
+    HtmlAttribute *attr = self->next;
+    self->next = NULL;
+
+    while(attr) {
+        /* Read the link before the attribute that holds it is freed */
+        HtmlAttribute *following = attr->next;
+        free(attr);
+        attr = following;
+    }
+}
+
+/*
+ Resets |self| to the empty state and frees its child list and attribute list.
+ |self| itself is not freed.
+*/
+static void html_node_deinit(HtmlNode *self) {
+    HtmlNodeChild *children = self->first_child;
+    HtmlAttribute *attributes = self->first_attr;
+
+    self->parent = NULL;
+    self->name_or_value.size = 0;
+    self->name_or_value.data = NULL;
+
+    if(children != NULL) {
+        /* The helper releases what hangs off the first entry, the entry itself is freed here */
+        html_node_child_deinit(children);
+        free(children);
+        self->last_child = NULL;
+        self->first_child = NULL;
+    }
+
+    if(attributes != NULL) {
+        html_attribute_deinit(attributes);
+        free(attributes);
+        self->first_attr = NULL;
+    }
+}
+
+/* State that is carried between the parser callbacks while the tree is being built */
+typedef struct {
+ HtmlNode *current_node; /* node that new children and attributes are added to, starts as the root node */
+ HtmlAttribute *current_node_last_attribute; /* last attribute added to |current_node| so far, NULL if none has been added since it was last reset */
+} ParseUserdata;
+
+/*
+ Parser callback that builds the tree, |userdata| is a ParseUserdata.
+ A tag start adds a child element and makes it the current node, a tag end goes back to the parent
+ (and is ignored at the root), an attribute is appended to the current node, and text/javascript code
+ is added as a child of the current node.
+ Returns 0 on success and 1 if an allocation failed.
+*/
+static int parse_callback(HtmlParser *html_parser, HtmlParseType parse_type, void *userdata) {
+    ParseUserdata *state = userdata;
+    HtmlNode *node;
+    HtmlAttribute *attr;
+
+    switch(parse_type) {
+        case HTML_PARSE_TAG_START:
+            node = html_node_create(html_parser->tag_name, state->current_node, HTML_NODE_NODE);
+            if(node == NULL)
+                return 1;
+
+            state->current_node = node;
+            state->current_node_last_attribute = NULL;
+            break;
+        case HTML_PARSE_TAG_END:
+            node = state->current_node->parent;
+            if(node != NULL) {
+                state->current_node = node;
+                state->current_node_last_attribute = NULL;
+            }
+            break;
+        case HTML_PARSE_ATTRIBUTE:
+            attr = malloc(sizeof(*attr));
+            if(attr == NULL)
+                return 1;
+
+            attr->next = NULL;
+            attr->key = html_parser->attribute_key;
+            attr->value = html_parser->attribute_value;
+
+            /* The first attribute becomes the head of the list, later ones go after the current tail */
+            if(state->current_node_last_attribute == NULL)
+                state->current_node->first_attr = attr;
+            else
+                state->current_node_last_attribute->next = attr;
+
+            state->current_node_last_attribute = attr;
+            break;
+        case HTML_PARSE_TEXT:
+        case HTML_PARSE_JAVASCRIPT_CODE: {
+            HtmlNodeType leaf_type = HTML_NODE_JS;
+            if(parse_type == HTML_PARSE_TEXT)
+                leaf_type = HTML_NODE_TEXT;
+
+            /* Text and code are leaf nodes, the current node does not change */
+            node = html_node_create(html_parser->text_stripped, state->current_node, leaf_type);
+            if(node == NULL)
+                return 1;
+
+            state->current_node_last_attribute = NULL;
+            break;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ Initializes |self| and fills it with the nodes found in |html_source| (|len| bytes).
+ Returns the result of html_parser_parse, which is 0 on success.
+ If parsing fails then the nodes that were created so far are released before returning.
+*/
+int html_parse_to_tree(HtmlTree *self, const char *html_source, size_t len) {
+    ParseUserdata state;
+    int status;
+
+    html_node_init(&self->root_node);
+    state.current_node_last_attribute = NULL;
+    state.current_node = &self->root_node;
+
+    status = html_parser_parse(html_source, len, parse_callback, &state);
+    if(status == 0)
+        return 0;
+
+    html_tree_deinit(self);
+    return status;
+}
+
+/* Frees all nodes and attributes below the root node. The root node is stored inside |self| and is only reset */
+void html_tree_deinit(HtmlTree *self) {
+    HtmlNode *root = &self->root_node;
+    html_node_deinit(root);
+}
+
+/*
+ Walks the attribute list of |self| and returns the first attribute whose key equals |name|,
+ compared with html_string_view_equals_case_insensitive. Returns NULL if there is no such attribute.
+*/
+HtmlAttribute* html_node_get_attribute_by_name(HtmlNode *self, HtmlStringView name) {
+    HtmlAttribute *candidate = self->first_attr;
+    while(candidate != NULL) {
+        if(html_string_view_equals_case_insensitive(&candidate->key, &name))
+            break;
+        candidate = candidate->next;
+    }
+    return candidate;
+}
diff --git a/tests/main.c b/tests/main.c
new file mode 100644
index 0000000..3b22f8a
--- /dev/null
+++ b/tests/main.c
@@ -0,0 +1,80 @@
+#include "../include/HtmlTree.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*
+ Reads the whole file at |path| into a newly allocated buffer.
+ On success the buffer is returned and |filesize| is set to the number of bytes in it.
+ The buffer is not null terminated and has to be freed by the caller with free.
+ An empty file gives a valid (non-NULL) buffer and a |filesize| of 0.
+ Returns NULL if the file can't be opened, measured or read completely, or if the allocation fails.
+ |filesize| is only written on success.
+*/
+char* file_get_content(const char *path, long *filesize) {
+    char *data = NULL;
+    long size;
+    FILE *file = fopen(path, "rb");
+    if(!file) {
+        perror(path);
+        return NULL;
+    }
+
+    if(fseek(file, 0, SEEK_END) != 0)
+        goto fail;
+
+    /* ftell returns -1 on failure, which must not be used as a size */
+    size = ftell(file);
+    if(size < 0)
+        goto fail;
+
+    if(fseek(file, 0, SEEK_SET) != 0)
+        goto fail;
+
+    /* malloc(0) is allowed to return NULL, so always ask for at least one byte */
+    data = malloc(size > 0 ? (size_t)size : 1);
+    if(!data)
+        goto fail;
+
+    /* A short read means that the buffer is only partially filled, treat that as a failure */
+    if(fread(data, 1, (size_t)size, file) != (size_t)size)
+        goto fail;
+
+    fclose(file);
+    *filesize = size;
+    return data;
+
+fail:
+    free(data);
+    fclose(file);
+    return NULL;
+}
+
+/* Prints every attribute in the list that starts at |attr| as key="value", each followed by a space */
+static void html_attributes_print(HtmlAttribute *attr) {
+    HtmlAttribute *it;
+    for(it = attr; it != NULL; it = it->next) {
+        printf("%.*s=\"%.*s\" ", (int)it->key.size, it->key.data, (int)it->value.size, it->value.data);
+    }
+}
+
+static void html_node_print(HtmlNode *node);
+
+/* Prints the node of every entry in the child list that starts at |node_child|, in list order */
+static void html_node_child_print(HtmlNodeChild *node_child) {
+    HtmlNodeChild *it;
+    for(it = node_child; it != NULL; it = it->next)
+        html_node_print(it->node);
+}
+
+/*
+ Prints |node| and everything below it to stdout.
+ Elements are printed as an opening tag with their attributes, then their children, then a closing tag.
+ Text and javascript code are printed as-is.
+*/
+static void html_node_print(HtmlNode *node) {
+    const int length = (int)node->name_or_value.size;
+
+    if(node->node_type == HTML_NODE_TEXT || node->node_type == HTML_NODE_JS) {
+        printf("%.*s", length, node->name_or_value.data);
+        return;
+    }
+
+    if(node->node_type != HTML_NODE_NODE)
+        return;
+
+    printf("<%.*s ", length, node->name_or_value.data);
+    html_attributes_print(node->first_attr);
+    printf(">\n");
+    html_node_child_print(node->first_child);
+    printf("</%.*s>\n", length, node->name_or_value.data);
+}
+
+/*
+ Parses the example html file from the html-parser tests and prints the resulting tree to stdout.
+ Returns 0 on success, 1 if the file can't be read and otherwise the error from html_parse_to_tree.
+*/
+int main(void) {
+    int result;
+    HtmlTree html_tree;
+    long filesize;
+    char *file_data;
+
+    file_data = file_get_content("depends/html-parser/tests/hotexamples.html", &filesize);
+    if(!file_data) {
+        fprintf(stderr, "Failed to read from file: depends/html-parser/tests/hotexamples.html\n");
+        return 1;
+    }
+
+    result = html_parse_to_tree(&html_tree, file_data, (size_t)filesize);
+    if(result != 0) {
+        /* The tree is already released by html_parse_to_tree on failure, only the file buffer is left to free */
+        free(file_data);
+        return result;
+    }
+
+    html_node_print(&html_tree.root_node);
+
+    /* The tree refers to |file_data|, so the tree is released before the buffer */
+    html_tree_deinit(&html_tree);
+    free(file_data);
+    return 0;
+}