diff options
author | Tulir Asokan <tulir@maunium.net> | 2018-04-14 18:09:02 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-04-14 18:09:02 +0300 |
commit | 53cdfb64c1773b63fb9432a1525b4ac4acb154fc (patch) | |
tree | 1c1aa180313abe8179d0b07591348e222b60483e /lib/htmlparser | |
parent | 14a84295d72a24a8bce8a71c240ab2b155ed5a1f (diff) | |
parent | d060d10615434c557373ee00ba009cc8b583e881 (diff) |
Merge pull request #18 from tulir/ui-refactor
Refactor UI to use interfaces and add advanced message rendering
Diffstat (limited to 'lib/htmlparser')
-rw-r--r-- | lib/htmlparser/doc.go | 3 | ||||
-rw-r--r-- | lib/htmlparser/htmlparser.go | 142 |
2 files changed, 145 insertions, 0 deletions
diff --git a/lib/htmlparser/doc.go b/lib/htmlparser/doc.go new file mode 100644 index 0000000..0e31960 --- /dev/null +++ b/lib/htmlparser/doc.go @@ -0,0 +1,3 @@ +// Package htmlparser contains an HTML parsing system similar to html.parser.HTMLParser in Python 3. +// The parser uses the golang.org/x/net/html Tokenizer in the background. +package htmlparser diff --git a/lib/htmlparser/htmlparser.go b/lib/htmlparser/htmlparser.go new file mode 100644 index 0000000..5ef8d98 --- /dev/null +++ b/lib/htmlparser/htmlparser.go @@ -0,0 +1,142 @@ +// gomuks - A terminal Matrix client written in Go. +// Copyright (C) 2018 Tulir Asokan +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +package htmlparser + +import ( + "io" + "strings" + + "golang.org/x/net/html" +) + +// HTMLProcessor contains the functions to process parsed HTML data. +type HTMLProcessor interface { + // Preprocess is called before the parsing is started. + Preprocess() + + // HandleStartTag is called with the tag name and attributes when + // the parser encounters a StartTagToken, except if the tag is + // always self-closing (see SelfClosingTags). + HandleStartTag(tagName string, attrs map[string]string) + // HandleSelfClosingTag is called with the tag name and attributes + // when the parser encounters a SelfClosingTagToken OR a StartTagToken + // with a tag that's always self-closing (see SelfClosingTags). 
+ HandleSelfClosingTag(tagName string, attrs map[string]string) + // HandleText is called with the text when the parser encounters + // a TextToken. + HandleText(text string) + // HandleEndTag is called with the tag name when the parser encounters + // an EndTagToken. + HandleEndTag(tagName string) + + // ReceiveError is called with the error when the parser encounters + // an ErrorToken that IS NOT io.EOF. Parsing stops after this call. + ReceiveError(err error) + + // Postprocess is called after parsing is completed successfully. + // An unsuccessful parse triggers a ReceiveError() call instead. + Postprocess() +} + +// HTMLParser wraps a golang.org/x/net/html Tokenizer and an HTMLProcessor to call +// the HTMLProcessor with data from the Tokenizer. +type HTMLParser struct { + *html.Tokenizer + processor HTMLProcessor +} + +// NewHTMLParserFromTokenizer creates a new HTMLParser from an existing html Tokenizer. +func NewHTMLParserFromTokenizer(z *html.Tokenizer, processor HTMLProcessor) HTMLParser { + return HTMLParser{ + z, + processor, + } +} + +// NewHTMLParserFromReader creates a Tokenizer with the given io.Reader and +// then uses that to create a new HTMLParser. +func NewHTMLParserFromReader(reader io.Reader, processor HTMLProcessor) HTMLParser { + return NewHTMLParserFromTokenizer(html.NewTokenizer(reader), processor) +} + +// NewHTMLParserFromString creates a Tokenizer with a reader of the given +// string and then uses that to create a new HTMLParser. +func NewHTMLParserFromString(html string, processor HTMLProcessor) HTMLParser { + return NewHTMLParserFromReader(strings.NewReader(html), processor) +} + +// SelfClosingTags is the list of tags that always trigger +// HTMLProcessor.HandleSelfClosingTag(), even if they are encountered +// as an html.StartTagToken rather than an html.SelfClosingTagToken. 
+var SelfClosingTags = []string{"img", "br", "hr", "area", "base", "basefont", "input", "link", "meta"} + +func (parser HTMLParser) mapAttrs() map[string]string { + attrs := make(map[string]string) + hasMore := true + for hasMore { + var key, val []byte + key, val, hasMore = parser.TagAttr() + attrs[string(key)] = string(val) + } + return attrs +} + +func (parser HTMLParser) isSelfClosing(tag string) bool { + for _, selfClosingTag := range SelfClosingTags { + if tag == selfClosingTag { + return true + } + } + return false +} + +// Process parses the HTML using the tokenizer in this parser and +// calls the appropriate functions of the HTML processor. +func (parser HTMLParser) Process() { + parser.processor.Preprocess() +Loop: + for { + tt := parser.Next() + switch tt { + case html.ErrorToken: + if parser.Err() != io.EOF { + parser.processor.ReceiveError(parser.Err()) + return + } + break Loop + case html.TextToken: + parser.processor.HandleText(string(parser.Text())) + case html.StartTagToken, html.SelfClosingTagToken: + tagb, _ := parser.TagName() + attrs := parser.mapAttrs() + tag := string(tagb) + + selfClosing := tt == html.SelfClosingTagToken || parser.isSelfClosing(tag) + + if selfClosing { + parser.processor.HandleSelfClosingTag(tag, attrs) + } else { + parser.processor.HandleStartTag(tag, attrs) + } + case html.EndTagToken: + tagb, _ := parser.TagName() + parser.processor.HandleEndTag(string(tagb)) + } + } + + parser.processor.Postprocess() +} |