aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/HtmlParser.h6
-rw-r--r--src/HtmlParser.c51
2 files changed, 37 insertions, 20 deletions
diff --git a/include/HtmlParser.h b/include/HtmlParser.h
index 45dc1e1..7536777 100644
--- a/include/HtmlParser.h
+++ b/include/HtmlParser.h
@@ -9,7 +9,7 @@
#include <stddef.h>
typedef struct {
- const char *data;
+ char *data;
size_t size;
} HtmlStringView;
@@ -28,7 +28,7 @@ typedef void (*HtmlParseCallback)(HtmlParser *html_parser, HtmlParseType parse_t
#define UNCLOSED_TAGS_SIZE 2048
struct HtmlParser {
- const char *source;
+ char *source;
size_t source_len;
size_t offset;
HtmlParseCallback parse_callback;
@@ -48,7 +48,7 @@ struct HtmlParser {
};
/* Note: HTML_PARSE_TAG_START is guaranteed to be called for a tag before HTML_PARSE_TAG_END */
-void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata);
+void html_parser_init(HtmlParser *self, char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata);
void html_parser_deinit(HtmlParser *self);
void html_parser_parse(HtmlParser *self);
diff --git a/src/HtmlParser.c b/src/HtmlParser.c
index 965368e..8b27d6d 100644
--- a/src/HtmlParser.c
+++ b/src/HtmlParser.c
@@ -46,7 +46,7 @@ static int is_newline(int c) {
return c == '\n' || c == '\r';
}
-static void lstrip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
+static void lstrip(char *str, size_t size, char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
size_t i = 0;
while(i < size && strip_filter_func(str[i])) {
++i;
@@ -55,7 +55,7 @@ static void lstrip(const char *str, size_t size, const char **output_str, size_t
*output_size = size - i;
}
-static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
+static void rstrip(char *str, size_t size, size_t *output_size, int(*strip_filter_func)(int)) {
ssize_t i = size - 1;
while(i >= 0 && strip_filter_func(str[i])) {
--i;
@@ -63,11 +63,20 @@ static void rstrip(const char *str, size_t size, size_t *output_size, int(*strip
*output_size = i + 1;
}
-static void strip(const char *str, size_t size, const char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
+static void strip(char *str, size_t size, char **output_str, size_t *output_size, int(*strip_filter_func)(int)) {
lstrip(str, size, output_str, output_size, strip_filter_func);
rstrip(*output_str, *output_size, output_size, strip_filter_func);
}
+/*
+ * Lowercases the view in place: every byte in the range 'A'..'Z' is replaced
+ * by its lowercase counterpart, all other bytes are left as they are.
+ * The bytes that string_view->data points at are overwritten directly.
+ */
+static void html_string_view_to_lowercase(HtmlStringView *string_view) {
+    char *cursor = string_view->data;
+    size_t remaining = string_view->size;
+    while(remaining > 0) {
+        if(*cursor >= 'A' && *cursor <= 'Z')
+            *cursor = (char)(*cursor + ('a' - 'A')); /* distance between the two cases */
+        ++cursor;
+        --remaining;
+    }
+}
+
static int is_void_tag(HtmlStringView *tag_name) {
HtmlStringView *tag_iter = &void_tags[0];
/* !DOCTYPE, !--, etc.... */
@@ -98,7 +107,7 @@ static void html_parser_reset(HtmlParser *self) {
self->unclosed_tags_offset = 0;
}
-void html_parser_init(HtmlParser *self, const char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) {
+void html_parser_init(HtmlParser *self, char *html_source, size_t len, HtmlParseCallback parse_callback, void *userdata) {
self->source = html_source;
self->source_len = len;
self->parse_callback = parse_callback;
@@ -131,19 +140,7 @@ static void html_parser_advance_char(HtmlParser *self) {
++self->offset;
}
-static int is_alpha(char c) {
- return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
-}
-
-static int is_digit(char c) {
- return c >= '0' && c <= '9';
-}
-
-static int is_identifier_char(char c) {
- return is_alpha(c) || is_digit(c) || c == '-' || c == '_' || c == '!' || c == ':';
-}
-
-static void html_parser_try_append_unclosed_tag(HtmlParser *self, const char *data, size_t size) {
+static void html_parser_try_append_unclosed_tag(HtmlParser *self, char *data, size_t size) {
if(self->unclosed_tags_offset == UNCLOSED_TAGS_SIZE) {
fprintf(stderr, "Reached the maximum number of unclosed tags! the html source is too broken\n");
return;
@@ -196,6 +193,24 @@ static int is_attribute_value_char(char c) {
}
}
+/*
+ * Returns 1 if c may be part of an identifier, 0 otherwise.
+ * Every byte is accepted except whitespace, the two quote characters and
+ * the markup delimiters '<', '>', '/' and '='.
+ */
+static int is_identifier_char(char c) {
+    switch(c) {
+        case ' ':
+        case '\t':
+        case '\n':
+        /* is_newline() in this file also treats '\r' as a line break; rejecting it here keeps a carriage return from a CRLF source out of the identifier */
+        case '\r':
+        case '\v':
+        case '"':
+        case '\'':
+        case '<':
+        case '>':
+        case '/':
+        case '=':
+            return 0;
+        default:
+            return 1;
+    }
+}
+
/* TODO: Unescape html characters in attribute value */
static void html_parser_parse_attribute_value_quoted(HtmlParser *self, char quote_symbol) {
self->attribute_value.data = self->source + self->offset;
@@ -330,6 +345,7 @@ static void html_parser_parse_tag_start(HtmlParser *self) {
if(tag_name_found) {
/* attribute name */
self->attribute_key = identifier;
+ html_string_view_to_lowercase(&self->attribute_key);
self->attribute_value.data = NULL;
self->attribute_value.size = 0;
@@ -351,6 +367,7 @@ static void html_parser_parse_tag_start(HtmlParser *self) {
} else {
/* tag name */
self->tag_name = identifier;
+ html_string_view_to_lowercase(&self->tag_name);
tag_name_found = 1;
if(self->tag_name.size == 3 && memcmp(self->tag_name.data, "!--", 3) == 0) {
html_parser_goto_comment_end(self);