From 782ba0657a0bddc6ccb31b1792f3fbf4500a0087 Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Sat, 14 Apr 2018 11:44:07 +0300 Subject: Make HTML rendering more advanced Also add Python-like HTML parser thing in lib/htmlparser --- lib/htmlparser/htmlparser.go | 108 +++++++++++++++++++++ ui/messages/htmlparser.go | 218 ++++++++++++++++++++++++++---------------- ui/messages/htmltagarray.go | 118 +++++++++++++++++++++++ ui/messages/parser.go | 6 +- ui/messages/tstring/string.go | 4 + 5 files changed, 367 insertions(+), 87 deletions(-) create mode 100644 lib/htmlparser/htmlparser.go create mode 100644 ui/messages/htmltagarray.go diff --git a/lib/htmlparser/htmlparser.go b/lib/htmlparser/htmlparser.go new file mode 100644 index 0000000..fb4e012 --- /dev/null +++ b/lib/htmlparser/htmlparser.go @@ -0,0 +1,108 @@ +// gomuks - A terminal Matrix client written in Go. +// Copyright (C) 2018 Tulir Asokan +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. 
+
+// Package htmlparser provides an event-driven wrapper around the
+// golang.org/x/net/html tokenizer.
+package htmlparser
+
+import (
+	"io"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// HTMLProcessor receives the callbacks that HTMLParser.Process emits while it
+// walks through the tokens of a document.
+type HTMLProcessor interface {
+	// Preprocess is called once before the first token is read.
+	Preprocess()
+	// HandleStartTag is called for start tags that are not treated as self-closing.
+	HandleStartTag(tagName string, attrs map[string]string)
+	// HandleSelfClosingTag is called for self-closing tag tokens and for start
+	// tags whose name is listed in SelfClosingTags.
+	HandleSelfClosingTag(tagName string, attrs map[string]string)
+	// HandleText is called for text tokens.
+	HandleText(text string)
+	// HandleEndTag is called for end tags.
+	HandleEndTag(tagName string)
+	// ReceiveError is called with the tokenizer's error when an error token is
+	// reached; Process stops reading tokens after that.
+	ReceiveError(err error)
+	// Postprocess is called once after Process has stopped reading tokens.
+	Postprocess()
+}
+
+// HTMLParser feeds the tokens of an html.Tokenizer to an HTMLProcessor.
+type HTMLParser struct {
+	*html.Tokenizer
+	processor HTMLProcessor
+}
+
+// NewHTMLParserFromTokenizer creates a parser that reads tokens from z and
+// reports them to processor.
+func NewHTMLParserFromTokenizer(z *html.Tokenizer, processor HTMLProcessor) HTMLParser {
+	return HTMLParser{
+		z,
+		processor,
+	}
+}
+
+// NewHTMLParserFromReader creates a parser that tokenizes the data read from reader.
+func NewHTMLParserFromReader(reader io.Reader, processor HTMLProcessor) HTMLParser {
+	return NewHTMLParserFromTokenizer(html.NewTokenizer(reader), processor)
+}
+
+// NewHTMLParserFromString creates a parser that tokenizes the given string.
+func NewHTMLParserFromString(html string, processor HTMLProcessor) HTMLParser {
+	return NewHTMLParserFromReader(strings.NewReader(html), processor)
+}
+
+// SelfClosingTags lists the tag names that Process reports through
+// HandleSelfClosingTag even when they are written as plain start tags.
+var SelfClosingTags = []string{"img", "br", "hr", "area", "base", "basefont", "input", "link", "meta"}
+
+// mapAttrs collects the attributes of the current tag token into a map.
+// The map is empty (never nil) when the tag has no attributes.
+func (parser HTMLParser) mapAttrs() map[string]string {
+	attrs := make(map[string]string)
+	hasMore := true
+	for hasMore {
+		var key, val []byte
+		key, val, hasMore = parser.TagAttr()
+		// TagAttr yields an empty key when the tag has no attributes at all;
+		// don't record that as an attribute named "".
+		if len(key) > 0 {
+			attrs[string(key)] = string(val)
+		}
+	}
+	return attrs
+}
+
+// isSelfClosing reports whether the given tag name is listed in SelfClosingTags.
+func (parser HTMLParser) isSelfClosing(tag string) bool {
+	for _, selfClosingTag := range SelfClosingTags {
+		if tag == selfClosingTag {
+			return true
+		}
+	}
+	return false
+}
+
+// Process reads tokens until the tokenizer reports an error token and forwards
+// each of them to the processor. Preprocess is called before the first token
+// and Postprocess after the last one.
+func (parser HTMLParser) Process() {
+	parser.processor.Preprocess()
+Loop:
+	for {
+		tt := parser.Next()
+		switch tt {
+		case html.ErrorToken:
+			parser.processor.ReceiveError(parser.Err())
+			break Loop
+		case html.TextToken:
+			parser.processor.HandleText(string(parser.Text()))
+		case html.StartTagToken, html.SelfClosingTagToken:
+			// The tag name has to be read before the attributes.
+			tagb, _ := parser.TagName()
+			attrs := parser.mapAttrs()
+			tag := string(tagb)
+
+			selfClosing := tt == html.SelfClosingTagToken || parser.isSelfClosing(tag)
+
+			if selfClosing {
+				parser.processor.HandleSelfClosingTag(tag, attrs)
+			} else {
+				parser.processor.HandleStartTag(tag, attrs)
+			}
+		case html.EndTagToken:
+			tagb, _ := parser.TagName()
+			parser.processor.HandleEndTag(string(tagb))
+		}
+	}
+
+	parser.processor.Postprocess()
+}
diff --git a/ui/messages/htmlparser.go b/ui/messages/htmlparser.go
index 0475e7a..aa6211e 100644
--- a/ui/messages/htmlparser.go
+++ b/ui/messages/htmlparser.go
@@ -17,120 +17,170 @@
 package messages
 
 import (
+	"fmt"
+	"io"
+	"math"
+	"regexp"
 	"strings"
 
-	"golang.org/x/net/html"
 	"maunium.net/go/gomatrix"
 	"maunium.net/go/gomuks/debug"
+	"maunium.net/go/gomuks/lib/htmlparser"
+	"maunium.net/go/gomuks/matrix/rooms"
 	"maunium.net/go/gomuks/ui/messages/tstring"
+	"maunium.net/go/gomuks/ui/widget"
 	"maunium.net/go/tcell"
 )
 
-// TagArray is a reversed queue for remembering what HTML tags are open.
-type TagArray []string
+// matrixToURL matches matrix.to links (scheme and "www." optional) and captures
+// the identifier after "#/", which has to start with '#', '@' or '!'.
+var matrixToURL = regexp.MustCompile("^(?:https?://)?(?:www\\.)?matrix\\.to/#/([#@!].*)")
 
-// Pushb converts the given byte array into a string and calls Push().
-func (ta *TagArray) Pushb(tag []byte) {
-	ta.Push(string(tag))
+// MatrixHTMLProcessor is the HTML processor that renders the formatted body of
+// a Matrix message into a TString.
+type MatrixHTMLProcessor struct {
+	// text is the output rendered so far.
+	text tstring.TString
+
+	// indent is written after every line break produced by newline().
+	indent string
+	// listType is the name of the list tag ("ol" or "ul") that was opened last.
+	listType string
+	// lineIsNew is true while nothing has been written to the current line.
+	lineIsNew bool
+	// openTags holds the tags that have been opened but not closed yet.
+	openTags *TagArray
+
+	// room is used to look up the members that user links point to.
+	room *rooms.Room
 }
 
-// Popb converts the given byte array into a string and calls Pop().
-func (ta *TagArray) Popb(tag []byte) {
-	ta.Pop(string(tag))
+// newline starts a new line that begins with the current indent, unless the
+// current line is still empty.
+func (parser *MatrixHTMLProcessor) newline() {
+	if !parser.lineIsNew {
+		parser.text = parser.text.Append("\n" + parser.indent)
+		parser.lineIsNew = true
+	}
 }
 
-// Hasb converts the given byte array into a string and calls Has().
-func (ta *TagArray) Hasb(tag []byte) {
-	ta.Has(string(tag))
+// Preprocess does nothing; the processor needs no setup before the first token.
+func (parser *MatrixHTMLProcessor) Preprocess() {}
+
+// HandleText appends the given text to the output, styled according to the
+// formatting tags that are currently open. Text inside a link is not written
+// right away: it is collected into the open "a" tag instead.
+func (parser *MatrixHTMLProcessor) HandleText(text string) {
+	style := tcell.StyleDefault
+	for _, tag := range *parser.openTags {
+		switch tag.Tag {
+		case "b", "strong":
+			style = style.Bold(true)
+		case "i", "em":
+			style = style.Italic(true)
+		case "s", "del":
+			style = style.Strikethrough(true)
+		case "u", "ins":
+			style = style.Underline(true)
+		case "a":
+			// Buffer the link text on the tag; HandleEndTag reads it back.
+			tag.Text += text
+			return
+		}
+	}
+
+	// NOTE(review): this strips newlines inside <pre>/<code>, where they are
+	// normally significant, and keeps them everywhere else. That looks
+	// inverted, but paragraph breaks currently rely on the raw newlines between
+	// block tags, so confirm the intent before changing it.
+	if parser.openTags.Has("pre", "code") {
+		text = strings.Replace(text, "\n", "", -1)
+	}
+	parser.text = parser.text.AppendStyle(text, style)
+	parser.lineIsNew = false
 }
 
-// HasAfterb converts the given byte array into a string and calls HasAfter().
-func (ta *TagArray) HasAfterb(tag []byte, after int) {
-	ta.HasAfter(string(tag), after)
+// HandleStartTag writes the prefix that belongs to the given tag (heading
+// marks, list markers, quote marks) and remembers the tag as open.
+func (parser *MatrixHTMLProcessor) HandleStartTag(tagName string, attrs map[string]string) {
+	tag := &TagWithMeta{Tag: tagName}
+	switch tag.Tag {
+	case "h1", "h2", "h3", "h4", "h5", "h6":
+		// The digit of the tag name is the number of '#' characters to print.
+		length := int(tag.Tag[1] - '0')
+		parser.text = parser.text.Append(strings.Repeat("#", length) + " ")
+		parser.lineIsNew = false
+	case "a":
+		// A missing href simply leaves Meta empty.
+		tag.Meta = attrs["href"]
+	case "ol", "ul":
+		parser.listType = tag.Tag
+	case "li":
+		// Items that don't belong to an open ordered list get a bullet.
+		marker := "* "
+		if parser.listType == "ol" {
+			// Get returns nil when no <ol> is open any more, e.g. for a stray
+			// <li> after </ol>; such items fall back to the bullet as well.
+			if list := parser.openTags.Get(parser.listType); list != nil {
+				list.Counter++
+				marker = fmt.Sprintf("%d. ", list.Counter)
+			}
+		}
+		parser.text = parser.text.Append(marker)
+		// Continuation lines are indented by the width of the marker.
+		parser.indent += strings.Repeat(" ", len(marker))
+		parser.lineIsNew = false
+	case "blockquote":
+		parser.indent += "> "
+		parser.text = parser.text.Append("> ")
+		parser.lineIsNew = false
+	}
+	parser.openTags.PushMeta(tag)
 }
 
-// Push adds the given tag to the array.
-func (ta *TagArray) Push(tag string) {
-	*ta = append(*ta, "")
-	copy((*ta)[1:], *ta)
-	(*ta)[0] = tag
+// HandleSelfClosingTag starts a new line for <br>; other self-closing tags are ignored.
+func (parser *MatrixHTMLProcessor) HandleSelfClosingTag(tagName string, attrs map[string]string) {
+	if tagName == "br" {
+		parser.newline()
+	}
 }
 
-// Pop removes the given tag from the array.
-func (ta *TagArray) Pop(tag string) {
-	if (*ta)[0] == tag {
-		// This is the default case and is lighter than append(), so we handle it separately.
-		*ta = (*ta)[1:]
-	} else if index := ta.Has(tag); index != -1 {
-		*ta = append((*ta)[:index], (*ta)[index+1:]...)
+// HandleEndTag closes the given tag: it undoes the indent added by list items
+// and block quotes and writes out links. End tags without a matching open tag
+// are ignored.
+func (parser *MatrixHTMLProcessor) HandleEndTag(tagName string) {
+	// An end tag may arrive while nothing is open (e.g. a message that starts
+	// with "</b>"); there is nothing to close then.
+	if len(*parser.openTags) == 0 {
+		return
+	}
+	// Pop returns nil for an end tag whose start tag was never seen.
+	tag := parser.openTags.Pop(tagName)
+	if tag == nil {
+		return
+	}
+
+	switch tag.Tag {
+	case "li", "blockquote":
+		indentSize := 2
+		if tag.Tag == "li" && parser.listType == "ol" {
+			// Numbered items are indented by the width of "<counter>. ".
+			// Keep the default when the list is gone or has no items yet.
+			if list := parser.openTags.Get(parser.listType); list != nil && list.Counter > 0 {
+				indentSize = int(math.Log10(float64(list.Counter))+1) + len(". ")
+			}
+		}
+		if len(parser.indent) >= indentSize {
+			parser.indent = parser.indent[0 : len(parser.indent)-indentSize]
+		}
+		// TODO this newline is sometimes not good
+		parser.newline()
+	case "a":
+		match := matrixToURL.FindStringSubmatch(tag.Meta)
+		if len(match) == 2 {
+			// matrix.to link: show the target instead of the link text.
+			pillTarget := match[1]
+			if pillTarget[0] == '@' {
+				if member := parser.room.GetMember(pillTarget); member != nil {
+					parser.text = parser.text.AppendColor(member.DisplayName, widget.GetHashColor(member.DisplayName))
+				} else {
+					parser.text = parser.text.Append(pillTarget)
+				}
+			} else {
+				parser.text = parser.text.Append(pillTarget)
+			}
+		} else {
+			// TODO make text clickable rather than printing URL
+			parser.text = parser.text.Append(fmt.Sprintf("%s (%s)", tag.Text, tag.Meta))
+		}
+		parser.lineIsNew = false
+	case "ol", "ul":
+		// The closed list is no longer the current one: fall back to the
+		// nearest list that is still open (index 0 is the most recently opened
+		// tag), or to no list at all.
+		parser.listType = ""
+		for _, open := range *parser.openTags {
+			if open.Tag == "ol" || open.Tag == "ul" {
+				parser.listType = open.Tag
+				break
+			}
+		}
+	case "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", "div":
+		// parser.newline()
 	}
 }
 
-// Has returns the first index where the given tag is, or -1 if it's not in the list.
-func (ta *TagArray) Has(tag string) int {
-	return ta.HasAfter(tag, -1)
+// ReceiveError logs tokenizer errors other than io.EOF.
+func (parser *MatrixHTMLProcessor) ReceiveError(err error) {
+	if err == io.EOF {
+		return
+	}
+	debug.Print("Unexpected error parsing HTML:", err)
 }
 
-// HasAfter returns the first index after the given index where the given tag is,
-// or -1 if the given tag is not on the list after the given index.
-func (ta *TagArray) HasAfter(tag string, after int) int {
-	for i := after + 1; i < len(*ta); i++ {
-		if (*ta)[i] == tag {
-			return i
-		}
+// Postprocess drops a single trailing line break from the rendered text.
+func (parser *MatrixHTMLProcessor) Postprocess() {
+	last := len(parser.text) - 1
+	if last >= 0 && parser.text[last].Char == '\n' {
+		parser.text = parser.text[:last]
 	}
-	return -1
 }
 
 // ParseHTMLMessage parses a HTML-formatted Matrix event into a UIMessage.
-func ParseHTMLMessage(evt *gomatrix.Event) tstring.TString {
-	//textData, _ := evt.Content["body"].(string)
+func ParseHTMLMessage(room *rooms.Room, evt *gomatrix.Event) tstring.TString {
 	htmlData, _ := evt.Content["formatted_body"].(string)
 
-	z := html.NewTokenizer(strings.NewReader(htmlData))
-	text := tstring.NewTString("")
-
-	openTags := &TagArray{}
-
-Loop:
-	for {
-		tt := z.Next()
-		switch tt {
-		case html.ErrorToken:
-			break Loop
-		case html.TextToken:
-			style := tcell.StyleDefault
-			for _, tag := range *openTags {
-				switch tag {
-				case "b", "strong":
-					style = style.Bold(true)
-				case "i", "em":
-					style = style.Italic(true)
-				case "s", "del":
-					style = style.Strikethrough(true)
-				case "u", "ins":
-					style = style.Underline(true)
-				}
-			}
-			text = text.AppendStyle(string(z.Text()), style)
-		case html.SelfClosingTagToken, html.StartTagToken:
-			tagb, _ := z.TagName()
-			tag := string(tagb)
-			switch tag {
-			case "br":
-				debug.Print("BR found")
-				debug.Print(text.String())
-				text = text.Append("\n")
-			default:
-				if tt == html.StartTagToken {
-					openTags.Push(tag)
-				}
-			}
-		case html.EndTagToken:
-			tagb, _ := z.TagName()
-			openTags.Popb(tagb)
-		}
+	// indent and listType start out as empty strings (their zero value); the
+	// first line counts as new so that no leading line break is written.
+	processor := &MatrixHTMLProcessor{
+		text:      tstring.NewBlankTString(),
+		openTags:  &TagArray{},
+		room:      room,
+		lineIsNew: true,
 	}
 
-	return text
+	htmlparser.NewHTMLParserFromString(htmlData, processor).Process()
+
+	return processor.text
 }
diff --git a/ui/messages/htmltagarray.go b/ui/messages/htmltagarray.go
new file mode 100644
index 0000000..597f0c7
--- /dev/null
+++ b/ui/messages/htmltagarray.go
@@ -0,0 +1,118 @@
+// gomuks - A terminal Matrix client written in Go.
+// Copyright (C) 2018 Tulir Asokan
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package messages
+
+// TagWithMeta is an open HTML tag together with the data collected for it
+// while it is open.
+type TagWithMeta struct {
+	// Tag is the name of the tag.
+	Tag string
+	// Counter is the number of items seen so far when the tag is an ordered list.
+	Counter int
+	// Meta is the href value when the tag is a link.
+	Meta string
+	// Text is the text collected inside the tag when it is a link.
+	Text string
+}
+
+// BlankTag is an empty TagWithMeta that is used as a placeholder value.
+var BlankTag = &TagWithMeta{}
+
+// TagArray remembers which HTML tags are open; the most recently opened tag
+// comes first.
+type TagArray []*TagWithMeta
+
+// Pushb is Push() for a tag name given as a byte array.
+func (ta *TagArray) Pushb(tag []byte) {
+	ta.Push(string(tag))
+}
+
+// Popb is Pop() for a tag name given as a byte array.
+func (ta *TagArray) Popb(tag []byte) *TagWithMeta {
+	return ta.Pop(string(tag))
+}
+
+// Indexb converts the given byte array into a string and calls Index().
+func (ta *TagArray) Indexb(tag []byte) int {
+	return ta.Index(string(tag))
+}
+
+// IndexAfterb converts the given byte array into a string and calls IndexAfter().
+func (ta *TagArray) IndexAfterb(tag []byte, after int) int {
+	return ta.IndexAfter(string(tag), after)
+}
+
+// Push adds the given tag to the array.
+func (ta *TagArray) Push(tag string) {
+	ta.PushMeta(&TagWithMeta{Tag: tag})
+}
+
+// PushMeta adds the given tag object to the front of the array.
+func (ta *TagArray) PushMeta(tag *TagWithMeta) {
+	// Grow the array by one element, shift everything one slot towards the
+	// end and put the new tag into the slot freed at the front.
+	*ta = append(*ta, BlankTag)
+	copy((*ta)[1:], *ta)
+	(*ta)[0] = tag
+}
+
+// Pop removes the first (most recently pushed) occurrence of the given tag
+// from the array and returns it. It returns nil if the array is empty or does
+// not contain the tag.
+func (ta *TagArray) Pop(tag string) (removed *TagWithMeta) {
+	if len(*ta) == 0 {
+		// Nothing is open, e.g. for a stray end tag at the start of the input.
+		return nil
+	}
+	if (*ta)[0].Tag == tag {
+		// This is the default case and is lighter than append(), so we handle it separately.
+		removed = (*ta)[0]
+		*ta = (*ta)[1:]
+	} else if index := ta.Index(tag); index != -1 {
+		removed = (*ta)[index]
+		*ta = append((*ta)[:index], (*ta)[index+1:]...)
+	}
+	return
+}
+
+// Index returns the first index where the given tag is, or -1 if it's not in the list.
+func (ta *TagArray) Index(tag string) int {
+	return ta.IndexAfter(tag, -1)
+}
+
+// IndexAfter returns the first index after the given index where the given tag is,
+// or -1 if the given tag is not on the list after the given index.
+func (ta *TagArray) IndexAfter(tag string, after int) int {
+	for i := after + 1; i < len(*ta); i++ {
+		if (*ta)[i].Tag == tag {
+			return i
+		}
+	}
+	return -1
+}
+
+// Get returns the first occurrence of the given tag, or nil if it's not in the list.
+func (ta *TagArray) Get(tag string) *TagWithMeta {
+	return ta.GetAfter(tag, -1)
+}
+
+// GetAfter returns the first occurrence of the given tag after the given index,
+// or nil if the given tag is not on the list after the given index.
+func (ta *TagArray) GetAfter(tag string, after int) *TagWithMeta {
+	// Start scanning at the element right after the given index.
+	for i := after + 1; i < len(*ta); i++ {
+		if (*ta)[i].Tag == tag {
+			return (*ta)[i]
+		}
+	}
+	return nil
+}
+
+// Has reports whether at least one of the given tags is currently in the array.
+func (ta *TagArray) Has(tags ...string) bool {
+	for _, tag := range tags {
+		// Index returns -1 for tags that are not in the array.
+		if index := ta.Index(tag); index != -1 {
+			return true
+		}
+	}
+	return false
+}
diff --git a/ui/messages/parser.go b/ui/messages/parser.go
index d8069c6..80ce5d6 100644
--- a/ui/messages/parser.go
+++ b/ui/messages/parser.go
@@ -36,7 +36,7 @@ func ParseEvent(gmx ifc.Gomuks, room *rooms.Room, evt *gomatrix.Event) UIMessage
 	}
 	switch evt.Type {
 	case "m.room.message":
-		return ParseMessage(gmx, evt)
+		return ParseMessage(gmx, room, evt)
 	case "m.room.member":
 		return ParseMembershipEvent(evt)
 	}
@@ -51,14 +51,14 @@ func unixToTime(unix int64) time.Time {
 	return timestamp
 }
 
-func ParseMessage(gmx ifc.Gomuks, evt *gomatrix.Event) UIMessage {
+func ParseMessage(gmx ifc.Gomuks, room *rooms.Room, evt *gomatrix.Event) UIMessage {
 	msgtype, _ := evt.Content["msgtype"].(string)
 	ts := unixToTime(evt.Timestamp)
 	switch msgtype {
 	case "m.text", "m.notice", "m.emote":
 		format, hasFormat := evt.Content["format"].(string)
 		if hasFormat && format == "org.matrix.custom.html" {
-			text := ParseHTMLMessage(evt)
+			text := ParseHTMLMessage(room, evt)
 			return NewExpandedTextMessage(evt.ID, evt.Sender, msgtype, text, ts)
 		} else {
 			text, _ := evt.Content["body"].(string)
diff --git a/ui/messages/tstring/string.go b/ui/messages/tstring/string.go
index d1ad446..a87d16a 100644
--- a/ui/messages/tstring/string.go
+++ b/ui/messages/tstring/string.go
@@ -25,6 +25,10 @@ import (
 
 type TString []Cell
 
+func NewBlankTString() TString {
+	return make([]Cell, 0)
+}
+
 func NewTString(str string) TString {
 	newStr := make([]Cell, len(str))
 	for i, char := range str {
-- 
cgit v1.2.3