aboutsummaryrefslogtreecommitdiff
path: root/ui/messages/parser
diff options
context:
space:
mode:
authorTulir Asokan <tulir@maunium.net>2018-05-31 16:59:40 +0300
committerTulir Asokan <tulir@maunium.net>2018-05-31 16:59:40 +0300
commit1da02e3a13f9c6b2487378d41d757f3f2610d00a (patch)
tree8fe2d05affa347980b84dceba10e23df6a2f09a7 /ui/messages/parser
parente6043462118b77b41a89c90e26b9ae5a938ffcbd (diff)
Rewrite HTML parser
Diffstat (limited to 'ui/messages/parser')
-rw-r--r--ui/messages/parser/htmlparser.go303
-rw-r--r--ui/messages/parser/htmltagarray.go100
2 files changed, 179 insertions, 224 deletions
diff --git a/ui/messages/parser/htmlparser.go b/ui/messages/parser/htmlparser.go
index cd268a5..b88f55e 100644
--- a/ui/messages/parser/htmlparser.go
+++ b/ui/messages/parser/htmlparser.go
@@ -18,185 +18,240 @@ package parser
import (
"fmt"
- "io"
"math"
"regexp"
"strings"
"maunium.net/go/gomatrix"
- "maunium.net/go/gomuks/debug"
- "maunium.net/go/gomuks/lib/htmlparser"
"maunium.net/go/gomuks/matrix/rooms"
"maunium.net/go/gomuks/ui/messages/tstring"
"maunium.net/go/gomuks/ui/widget"
"maunium.net/go/tcell"
+ "golang.org/x/net/html"
)
var matrixToURL = regexp.MustCompile("^(?:https?://)?(?:www\\.)?matrix\\.to/#/([#@!].*)")
-type MatrixHTMLProcessor struct {
- text tstring.TString
+type htmlParser struct {
+ room *rooms.Room
+}
- senderID string
- sender string
- msgtype string
+type taggedTString struct {
+ tstring.TString
+ tag string
+}
- indent string
- listType string
- lineIsNew bool
- openTags *TagArray
+var AdjustStyleBold = func(style tcell.Style) tcell.Style {
+ return style.Bold(true)
+}
- room *rooms.Room
+var AdjustStyleItalic = func(style tcell.Style) tcell.Style {
+ return style.Italic(true)
+}
+
+var AdjustStyleUnderline = func(style tcell.Style) tcell.Style {
+ return style.Underline(true)
+}
+
+var AdjustStyleStrikethrough = func(style tcell.Style) tcell.Style {
+ return style.Strikethrough(true)
}
-func (parser *MatrixHTMLProcessor) newline() {
- if !parser.lineIsNew {
- parser.text = parser.text.Append("\n" + parser.indent)
- parser.lineIsNew = true
+func (parser *htmlParser) listToTString(node *html.Node, stripLinebreak bool) tstring.TString {
+ ordered := node.Data == "ol"
+ taggedChildren := parser.nodeToTaggedTStrings(node.FirstChild, stripLinebreak)
+ paddingLength := 0
+ if ordered {
+ paddingLength = int(math.Floor(math.Log10(float64(len(taggedChildren)))) + 1)
}
+ padding := strings.Repeat(" ", paddingLength+2)
+ var children []tstring.TString
+ counter := 1
+ for _, child := range taggedChildren {
+ if child.tag != "li" {
+ continue
+ }
+ var prefix string
+ if ordered {
+ prefix = fmt.Sprintf("%*d. ", paddingLength, counter)
+ } else {
+ prefix = "● "
+ }
+ str := child.TString.Prepend(prefix)
+ counter++
+ parts := str.Split('\n')
+ for i, part := range parts[1:] {
+ parts[i+1] = part.Prepend(padding)
+ }
+ str = tstring.Join(parts, "\n")
+ children = append(children, str)
+ }
+ return tstring.Join(children, "\n")
}
-func (parser *MatrixHTMLProcessor) Preprocess() {
- if parser.msgtype == "m.emote" {
- parser.text = tstring.NewColorTString(fmt.Sprintf("* %s ", parser.sender), widget.GetHashColor(parser.senderID))
+func (parser *htmlParser) basicFormatToTString(node *html.Node, stripLinebreak bool) tstring.TString {
+ str := parser.nodeToTagAwareTString(node.FirstChild, stripLinebreak)
+ switch node.Data {
+ case "b", "strong":
+ str.AdjustStyleFull(AdjustStyleBold)
+ case "i", "em":
+ str.AdjustStyleFull(AdjustStyleItalic)
+ case "s", "del":
+ str.AdjustStyleFull(AdjustStyleStrikethrough)
+ case "u", "ins":
+ str.AdjustStyleFull(AdjustStyleUnderline)
}
+ return str
}
-func (parser *MatrixHTMLProcessor) HandleText(text string) {
- style := tcell.StyleDefault
- for _, tag := range *parser.openTags {
- switch tag.Tag {
- case "b", "strong":
- style = style.Bold(true)
- case "i", "em":
- style = style.Italic(true)
- case "s", "del":
- style = style.Strikethrough(true)
- case "u", "ins":
- style = style.Underline(true)
- case "a":
- tag.Text += text
- return
- }
+func (parser *htmlParser) headerToTString(node *html.Node, stripLinebreak bool) tstring.TString {
+ children := parser.nodeToTStrings(node.FirstChild, stripLinebreak)
+ length := int(node.Data[1] - '0')
+ prefix := strings.Repeat("#", length) + " "
+ return tstring.Join(children, "").Prepend(prefix)
+}
+
+func (parser *htmlParser) blockquoteToTString(node *html.Node, stripLinebreak bool) tstring.TString {
+ str := parser.nodeToTagAwareTString(node.FirstChild, stripLinebreak)
+ childrenArr := str.TrimSpace().Split('\n')
+ for index, child := range childrenArr {
+ childrenArr[index] = child.Prepend("> ")
}
+ return tstring.Join(childrenArr, "\n")
+}
- if !parser.openTags.Has("pre", "code") {
- text = strings.Replace(text, "\n", "", -1)
+func (parser *htmlParser) linkToTString(node *html.Node, stripLinebreak bool) tstring.TString {
+ str := parser.nodeToTagAwareTString(node.FirstChild, stripLinebreak)
+ var href string
+ for _, attr := range node.Attr {
+ if attr.Key == "href" {
+ href = attr.Val
+ break
+ }
+ }
+ if len(href) == 0 {
+ return str
+ }
+ match := matrixToURL.FindStringSubmatch(href)
+ if len(match) == 2 {
+ pillTarget := match[1]
+ if pillTarget[0] == '@' {
+ if member := parser.room.GetMember(pillTarget); member != nil {
+ return tstring.NewColorTString(member.DisplayName, widget.GetHashColor(member.UserID))
+ }
+ }
+ return tstring.NewTString(pillTarget)
}
- parser.text = parser.text.AppendStyle(text, style)
- parser.lineIsNew = false
+ return str.Append(fmt.Sprintf(" (%s)", href))
}
-func (parser *MatrixHTMLProcessor) HandleStartTag(tagName string, attrs map[string]string) {
- tag := &TagWithMeta{Tag: tagName}
- switch tag.Tag {
+func (parser *htmlParser) tagToTString(node *html.Node, stripLinebreak bool) tstring.TString {
+ switch node.Data {
+ case "blockquote":
+ return parser.blockquoteToTString(node, stripLinebreak)
+ case "ol", "ul":
+ return parser.listToTString(node, stripLinebreak)
case "h1", "h2", "h3", "h4", "h5", "h6":
- length := int(tag.Tag[1] - '0')
- parser.text = parser.text.Append(strings.Repeat("#", length) + " ")
- parser.lineIsNew = false
+ return parser.headerToTString(node, stripLinebreak)
+ case "br":
+ return tstring.NewTString("\n")
+ case "b", "strong", "i", "em", "s", "del", "u", "ins":
+ return parser.basicFormatToTString(node, stripLinebreak)
case "a":
- tag.Meta, _ = attrs["href"]
- case "ol", "ul":
- parser.listType = tag.Tag
- case "li":
- indentSize := 2
- if parser.listType == "ol" {
- list := parser.openTags.Get(parser.listType)
- list.Counter++
- parser.text = parser.text.Append(fmt.Sprintf("%d. ", list.Counter))
- indentSize = int(math.Log10(float64(list.Counter))+1) + len(". ")
- } else {
- parser.text = parser.text.Append("* ")
- }
- parser.indent += strings.Repeat(" ", indentSize)
- parser.lineIsNew = false
- case "blockquote":
- parser.indent += "> "
- parser.text = parser.text.Append("> ")
- parser.lineIsNew = false
+ return parser.linkToTString(node, stripLinebreak)
+ case "p":
+ return parser.nodeToTagAwareTString(node.FirstChild, stripLinebreak).Append("\n")
+ case "pre":
+ return parser.nodeToTString(node.FirstChild, false)
+ default:
+ return parser.nodeToTagAwareTString(node.FirstChild, stripLinebreak)
}
- parser.openTags.PushMeta(tag)
}
-func (parser *MatrixHTMLProcessor) HandleSelfClosingTag(tagName string, attrs map[string]string) {
- if tagName == "br" {
- parser.newline()
+func (parser *htmlParser) singleNodeToTString(node *html.Node, stripLinebreak bool) taggedTString {
+ switch node.Type {
+ case html.TextNode:
+ if stripLinebreak {
+ node.Data = strings.Replace(node.Data, "\n", "", -1)
+ }
+ return taggedTString{tstring.NewTString(node.Data), "text"}
+ case html.ElementNode:
+ return taggedTString{parser.tagToTString(node, stripLinebreak), node.Data}
+ case html.DocumentNode:
+ return taggedTString{parser.nodeToTagAwareTString(node.FirstChild, stripLinebreak), "html"}
+ default:
+ return taggedTString{tstring.NewBlankTString(), "unknown"}
}
}
-func (parser *MatrixHTMLProcessor) HandleEndTag(tagName string) {
- tag := parser.openTags.Pop(tagName)
- if tag == nil {
- return
+func (parser *htmlParser) nodeToTaggedTStrings(node *html.Node, stripLinebreak bool) (strs []taggedTString) {
+ for ; node != nil; node = node.NextSibling {
+ strs = append(strs, parser.singleNodeToTString(node, stripLinebreak))
}
+ return
+}
+
+var BlockTags = []string{"p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "blockquote", "div", "hr", "table"}
- switch tag.Tag {
- case "li", "blockquote":
- indentSize := 2
- if tag.Tag == "li" && parser.listType == "ol" {
- list := parser.openTags.Get(parser.listType)
- indentSize = int(math.Log10(float64(list.Counter))+1) + len(". ")
+func (parser *htmlParser) isBlockTag(tag string) bool {
+ for _, blockTag := range BlockTags {
+ if tag == blockTag {
+ return true
}
- if len(parser.indent) >= indentSize {
- parser.indent = parser.indent[0 : len(parser.indent)-indentSize]
+ }
+ return false
+}
+
+func (parser *htmlParser) nodeToTagAwareTString(node *html.Node, stripLinebreak bool) tstring.TString {
+ strs := parser.nodeToTaggedTStrings(node, stripLinebreak)
+ output := tstring.NewBlankTString()
+ for i, str := range strs {
+ tstr := str.TString
+ curIsBlock := parser.isBlockTag(str.tag)
+ if i > 0 && curIsBlock {
+ tstr = tstr.Prepend("\n")
}
- // TODO this newline is sometimes not good
- parser.newline()
- case "a":
- match := matrixToURL.FindStringSubmatch(tag.Meta)
- if len(match) == 2 {
- pillTarget := match[1]
- if pillTarget[0] == '@' {
- if member := parser.room.GetMember(pillTarget); member != nil {
- parser.text = parser.text.AppendColor(member.DisplayName, widget.GetHashColor(member.UserID))
- } else {
- parser.text = parser.text.Append(pillTarget)
- }
- } else {
- parser.text = parser.text.Append(pillTarget)
- }
- } else {
- // TODO make text clickable rather than printing URL
- parser.text = parser.text.Append(fmt.Sprintf("%s (%s)", tag.Text, tag.Meta))
+ if curIsBlock && len(strs) < i+1 {
+ tstr = tstr.Append("\n")
}
- parser.lineIsNew = false
- case "p", "pre", "ol", "ul", "h1", "h2", "h3", "h4", "h5", "h6", "div":
- // parser.newline()
+ output = output.AppendTString(tstr)
}
+ return output.TrimSpace()
}
-func (parser *MatrixHTMLProcessor) ReceiveError(err error) {
- if err != io.EOF {
- debug.Print("Unexpected error parsing HTML:", err)
+func (parser *htmlParser) nodeToTStrings(node *html.Node, stripLinebreak bool) (strs []tstring.TString) {
+ for ; node != nil; node = node.NextSibling {
+ strs = append(strs, parser.singleNodeToTString(node, stripLinebreak).TString)
}
+ return
}
-func (parser *MatrixHTMLProcessor) Postprocess() {
- if len(parser.text) > 0 && parser.text[len(parser.text)-1].Char == '\n' {
- parser.text = parser.text[:len(parser.text)-1]
- }
+func (parser *htmlParser) nodeToTString(node *html.Node, stripLinebreak bool) tstring.TString {
+ return tstring.Join(parser.nodeToTStrings(node, stripLinebreak), "")
+}
+
+func (parser *htmlParser) Parse(htmlData string) tstring.TString {
+ node, _ := html.Parse(strings.NewReader(htmlData))
+ return parser.nodeToTagAwareTString(node, true)
}
// ParseHTMLMessage parses a HTML-formatted Matrix event into a UIMessage.
func ParseHTMLMessage(room *rooms.Room, evt *gomatrix.Event, senderDisplayname string) tstring.TString {
htmlData, _ := evt.Content["formatted_body"].(string)
htmlData = strings.Replace(htmlData, "\t", " ", -1)
- msgtype, _ := evt.Content["msgtype"].(string)
- processor := &MatrixHTMLProcessor{
- room: room,
- text: tstring.NewBlankTString(),
- msgtype: msgtype,
- senderID: evt.Sender,
- sender: senderDisplayname,
- indent: "",
- listType: "",
- lineIsNew: true,
- openTags: &TagArray{},
- }
+ parser := htmlParser{room}
+ str := parser.Parse(htmlData)
- parser := htmlparser.NewHTMLParserFromString(htmlData, processor)
- parser.Process()
+ msgtype, _ := evt.Content["msgtype"].(string)
+ if msgtype == "m.emote" {
+ str = tstring.Join([]tstring.TString{
+ tstring.NewTString("* "),
+ tstring.NewColorTString(senderDisplayname, widget.GetHashColor(evt.Sender)),
+ tstring.NewTString(" "),
+ str,
+ }, "")
+ }
- return processor.text
+ return str
}
diff --git a/ui/messages/parser/htmltagarray.go b/ui/messages/parser/htmltagarray.go
deleted file mode 100644
index 464caa9..0000000
--- a/ui/messages/parser/htmltagarray.go
+++ /dev/null
@@ -1,100 +0,0 @@
-// gomuks - A terminal Matrix client written in Go.
-// Copyright (C) 2018 Tulir Asokan
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-package parser
-
-// TagWithMeta is an open HTML tag with some metadata (e.g. list index, a href value).
-type TagWithMeta struct {
- Tag string
- Counter int
- Meta string
- Text string
-}
-
-// BlankTag is a blank TagWithMeta object.
-var BlankTag = &TagWithMeta{}
-
-// TagArray is a reversed queue for remembering what HTML tags are open.
-type TagArray []*TagWithMeta
-
-// Push adds the given tag to the array.
-func (ta *TagArray) Push(tag string) {
- ta.PushMeta(&TagWithMeta{Tag: tag})
-}
-
-// PushMeta adds the given tag to the array.
-func (ta *TagArray) PushMeta(tag *TagWithMeta) {
- *ta = append(*ta, BlankTag)
- copy((*ta)[1:], *ta)
- (*ta)[0] = tag
-}
-
-// Pop removes the given tag from the array.
-func (ta *TagArray) Pop(tag string) (removed *TagWithMeta) {
- if len(*ta) == 0 {
- return
- } else if (*ta)[0].Tag == tag {
- // This is the default case and is lighter than append(), so we handle it separately.
- removed = (*ta)[0]
- *ta = (*ta)[1:]
- } else if index := ta.Index(tag); index != -1 {
- removed = (*ta)[index]
- *ta = append((*ta)[:index], (*ta)[index+1:]...)
- }
- return
-}
-
-// Index returns the first index where the given tag is, or -1 if it's not in the list.
-func (ta *TagArray) Index(tag string) int {
- return ta.IndexAfter(tag, -1)
-}
-
-// IndexAfter returns the first index after the given index where the given tag is,
-// or -1 if the given tag is not on the list after the given index.
-func (ta *TagArray) IndexAfter(tag string, after int) int {
- for i := after + 1; i < len(*ta); i++ {
- if (*ta)[i].Tag == tag {
- return i
- }
- }
- return -1
-}
-
-// Get returns the first occurrence of the given tag, or nil if it's not in the list.
-func (ta *TagArray) Get(tag string) *TagWithMeta {
- return ta.GetAfter(tag, -1)
-}
-
-// GetAfter returns the first occurrence of the given tag, or nil if the given
-// tag is not on the list after the given index.
-func (ta *TagArray) GetAfter(tag string, after int) *TagWithMeta {
- for i := after + 1; i < len(*ta); i++ {
- if (*ta)[i].Tag == tag {
- return (*ta)[i]
- }
- }
- return nil
-}
-
-// Has returns whether or not the list has at least one of the given tags.
-func (ta *TagArray) Has(tags ...string) bool {
- for _, tag := range tags {
- if index := ta.Index(tag); index != -1 {
- return true
- }
- }
- return false
-}