diff options
Diffstat (limited to 'lib/htmlparser')
-rw-r--r-- | lib/htmlparser/doc.go | 3 | ||||
-rw-r--r-- | lib/htmlparser/htmlparser.go | 36 |
2 files changed, 38 insertions, 1 deletions
diff --git a/lib/htmlparser/doc.go b/lib/htmlparser/doc.go new file mode 100644 index 0000000..0e31960 --- /dev/null +++ b/lib/htmlparser/doc.go @@ -0,0 +1,3 @@ +// Package htmlparser contains a HTML parsing system similar to html.parser.HTMLParser in Python 3. +// The parser uses x/net/html.Tokenizer in the background. +package htmlparser diff --git a/lib/htmlparser/htmlparser.go b/lib/htmlparser/htmlparser.go index fb4e012..5ef8d98 100644 --- a/lib/htmlparser/htmlparser.go +++ b/lib/htmlparser/htmlparser.go @@ -23,21 +23,43 @@ import ( "golang.org/x/net/html" ) +// HTMLProcessor contains the functions to process parsed HTML data. type HTMLProcessor interface { + // Preprocess is called before the parsing is started. Preprocess() + + // HandleStartTag is called with the tag name and attributes when + // the parser encounters a StartTagToken, except if the tag is + // always self-closing. HandleStartTag(tagName string, attrs map[string]string) + // HandleSelfClosingTag is called with the tag name and attributes + // when the parser encounters a SelfClosingTagToken OR a StartTagToken + // with a tag that's always self-closing. HandleSelfClosingTag(tagName string, attrs map[string]string) + // HandleText is called with the text when the parser encounters + // a TextToken. HandleText(text string) + // HandleEndTag is called with the tag name when the parser encounters + // an EndTagToken. HandleEndTag(tagName string) + + // ReceiveError is called with the error when the parser encounters + // an ErrorToken that IS NOT io.EOF. ReceiveError(err error) + + // Postprocess is called after parsing is completed successfully. + // An unsuccessful parsing will trigger a ReceiveError() call. Postprocess() } +// HTMLParser wraps a net/html.Tokenizer and a HTMLProcessor to call +// the HTMLProcessor with data from the Tokenizer. type HTMLParser struct { *html.Tokenizer processor HTMLProcessor } +// NewHTMLParserFromTokenizer creates a new HTMLParser from an existing html Tokenizer. func NewHTMLParserFromTokenizer(z *html.Tokenizer, processor HTMLProcessor) HTMLParser { return HTMLParser{ z, @@ -45,14 +67,21 @@ func NewHTMLParserFromTokenizer(z *html.Tokenizer, processor HTMLProcessor) HTML } } +// NewHTMLParserFromReader creates a Tokenizer with the given io.Reader and +// then uses that to create a new HTMLParser. func NewHTMLParserFromReader(reader io.Reader, processor HTMLProcessor) HTMLParser { return NewHTMLParserFromTokenizer(html.NewTokenizer(reader), processor) } +// NewHTMLParserFromString creates a Tokenizer with a reader of the given +// string and then uses that to create a new HTMLParser. func NewHTMLParserFromString(html string, processor HTMLProcessor) HTMLParser { return NewHTMLParserFromReader(strings.NewReader(html), processor) } +// SelfClosingTags is the list of tags that always call +// HTMLProcessor.HandleSelfClosingTag() even if it is encountered +// as a html.StartTagToken rather than html.SelfClosingTagToken. var SelfClosingTags = []string{"img", "br", "hr", "area", "base", "basefont", "input", "link", "meta"} func (parser HTMLParser) mapAttrs() map[string]string { @@ -75,6 +104,8 @@ func (parser HTMLParser) isSelfClosing(tag string) bool { return false } +// Process parses the HTML using the tokenizer in this parser and +// calls the appropriate functions of the HTML processor. func (parser HTMLParser) Process() { parser.processor.Preprocess() Loop: @@ -82,7 +113,10 @@ Loop: tt := parser.Next() switch tt { case html.ErrorToken: - parser.processor.ReceiveError(parser.Err()) + if parser.Err() != io.EOF { + parser.processor.ReceiveError(parser.Err()) + return + } break Loop case html.TextToken: parser.processor.HandleText(string(parser.Text())) |