aboutsummaryrefslogtreecommitdiff
path: root/vendor/maunium.net/go/mautrix/format/htmlparser.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/maunium.net/go/mautrix/format/htmlparser.go')
-rw-r--r--vendor/maunium.net/go/mautrix/format/htmlparser.go257
1 files changed, 257 insertions, 0 deletions
diff --git a/vendor/maunium.net/go/mautrix/format/htmlparser.go b/vendor/maunium.net/go/mautrix/format/htmlparser.go
new file mode 100644
index 0000000..b681be6
--- /dev/null
+++ b/vendor/maunium.net/go/mautrix/format/htmlparser.go
@@ -0,0 +1,257 @@
+// Copyright 2018 Tulir Asokan
+package format
+
+import (
+ "fmt"
+ "math"
+ "regexp"
+ "strings"
+
+ "golang.org/x/net/html"
+ "strconv"
+)
+
+var MatrixToURL = regexp.MustCompile("^(?:https?://)?(?:www\\.)?matrix\\.to/#/([#@!+].*)(?:/(\\$.+))?")
+
+type TextConverter func(string) string
+
+type HTMLParser struct {
+ PillConverter func(mxid, eventID string) string
+ TabsToSpaces int
+ Newline string
+ BoldConverter TextConverter
+ ItalicConverter TextConverter
+ StrikethroughConverter TextConverter
+ UnderlineConverter TextConverter
+ MonospaceBlockConverter TextConverter
+ MonospaceConverter TextConverter
+}
+
+type TaggedString struct {
+ string
+ tag string
+}
+
+func (parser *HTMLParser) getAttribute(node *html.Node, attribute string) string {
+ for _, attr := range node.Attr {
+ if attr.Key == attribute {
+ return attr.Val
+ }
+ }
+ return ""
+}
+
+func Digits(num int) int {
+ return int(math.Floor(math.Log10(float64(num))) + 1)
+}
+
+func (parser *HTMLParser) listToString(node *html.Node, stripLinebreak bool) string {
+ ordered := node.Data == "ol"
+ taggedChildren := parser.nodeToTaggedStrings(node.FirstChild, stripLinebreak)
+ counter := 1
+ indentLength := 0
+ if ordered {
+ start := parser.getAttribute(node, "start")
+ if len(start) > 0 {
+ counter, _ = strconv.Atoi(start)
+ }
+
+ longestIndex := (counter - 1) + len(taggedChildren)
+ indentLength = Digits(longestIndex)
+ }
+ indent := strings.Repeat(" ", indentLength+2)
+ var children []string
+ for _, child := range taggedChildren {
+ if child.tag != "li" {
+ continue
+ }
+ var prefix string
+ if ordered {
+ indexPadding := indentLength - Digits(counter)
+ prefix = fmt.Sprintf("%d. %s", counter, strings.Repeat(" ", indexPadding))
+ } else {
+ prefix = "● "
+ }
+ str := prefix + child.string
+ counter++
+ parts := strings.Split(str, "\n")
+ for i, part := range parts[1:] {
+ parts[i+1] = indent + part
+ }
+ str = strings.Join(parts, "\n")
+ children = append(children, str)
+ }
+ return strings.Join(children, "\n")
+}
+
+func (parser *HTMLParser) basicFormatToString(node *html.Node, stripLinebreak bool) string {
+ str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak)
+ switch node.Data {
+ case "b", "strong":
+ if parser.BoldConverter != nil {
+ return parser.BoldConverter(str)
+ }
+ return fmt.Sprintf("**%s**", str)
+ case "i", "em":
+ if parser.ItalicConverter != nil {
+ return parser.ItalicConverter(str)
+ }
+ return fmt.Sprintf("_%s_", str)
+ case "s", "del":
+ if parser.StrikethroughConverter != nil {
+ return parser.StrikethroughConverter(str)
+ }
+ return fmt.Sprintf("~~%s~~", str)
+ case "u", "ins":
+ if parser.UnderlineConverter != nil {
+ return parser.UnderlineConverter(str)
+ }
+ case "tt", "code":
+ if parser.MonospaceConverter != nil {
+ return parser.MonospaceConverter(str)
+ }
+ }
+ return str
+}
+
+func (parser *HTMLParser) headerToString(node *html.Node, stripLinebreak bool) string {
+ children := parser.nodeToStrings(node.FirstChild, stripLinebreak)
+ length := int(node.Data[1] - '0')
+ prefix := strings.Repeat("#", length) + " "
+ return prefix + strings.Join(children, "")
+}
+
+func (parser *HTMLParser) blockquoteToString(node *html.Node, stripLinebreak bool) string {
+ str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak)
+ childrenArr := strings.Split(strings.TrimSpace(str), "\n")
+ for index, child := range childrenArr {
+ childrenArr[index] = "> " + child
+ }
+ return strings.Join(childrenArr, "\n")
+}
+
+func (parser *HTMLParser) linkToString(node *html.Node, stripLinebreak bool) string {
+ str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak)
+ href := parser.getAttribute(node, "href")
+ if len(href) == 0 {
+ return str
+ }
+ match := MatrixToURL.FindStringSubmatch(href)
+ if len(match) == 2 || len(match) == 3 {
+ if parser.PillConverter != nil {
+ mxid := match[1]
+ eventID := ""
+ if len(match) == 3 {
+ eventID = match[2]
+ }
+ return parser.PillConverter(mxid, eventID)
+ }
+ return str
+ }
+ return fmt.Sprintf("%s (%s)", str, href)
+}
+
+func (parser *HTMLParser) tagToString(node *html.Node, stripLinebreak bool) string {
+ switch node.Data {
+ case "blockquote":
+ return parser.blockquoteToString(node, stripLinebreak)
+ case "ol", "ul":
+ return parser.listToString(node, stripLinebreak)
+ case "h1", "h2", "h3", "h4", "h5", "h6":
+ return parser.headerToString(node, stripLinebreak)
+ case "br":
+ return parser.Newline
+ case "b", "strong", "i", "em", "s", "del", "u", "ins", "tt", "code":
+ return parser.basicFormatToString(node, stripLinebreak)
+ case "a":
+ return parser.linkToString(node, stripLinebreak)
+ case "p":
+ return parser.nodeToTagAwareString(node.FirstChild, stripLinebreak) + "\n"
+ case "pre":
+ var preStr string
+ if node.FirstChild != nil && node.FirstChild.Type == html.ElementNode && node.FirstChild.Data == "code" {
+ preStr = parser.nodeToString(node.FirstChild.FirstChild, false)
+ } else {
+ preStr = parser.nodeToString(node.FirstChild, false)
+ }
+ if parser.MonospaceBlockConverter != nil {
+ return parser.MonospaceBlockConverter(preStr)
+ }
+ return preStr
+ default:
+ return parser.nodeToTagAwareString(node.FirstChild, stripLinebreak)
+ }
+}
+
+func (parser *HTMLParser) singleNodeToString(node *html.Node, stripLinebreak bool) TaggedString {
+ switch node.Type {
+ case html.TextNode:
+ if stripLinebreak {
+ node.Data = strings.Replace(node.Data, "\n", "", -1)
+ }
+ return TaggedString{node.Data, "text"}
+ case html.ElementNode:
+ return TaggedString{parser.tagToString(node, stripLinebreak), node.Data}
+ case html.DocumentNode:
+ return TaggedString{parser.nodeToTagAwareString(node.FirstChild, stripLinebreak), "html"}
+ default:
+ return TaggedString{"", "unknown"}
+ }
+}
+
+func (parser *HTMLParser) nodeToTaggedStrings(node *html.Node, stripLinebreak bool) (strs []TaggedString) {
+ for ; node != nil; node = node.NextSibling {
+ strs = append(strs, parser.singleNodeToString(node, stripLinebreak))
+ }
+ return
+}
+
+var BlockTags = []string{"p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "blockquote", "div", "hr", "table"}
+
+func (parser *HTMLParser) isBlockTag(tag string) bool {
+ for _, blockTag := range BlockTags {
+ if tag == blockTag {
+ return true
+ }
+ }
+ return false
+}
+
+func (parser *HTMLParser) nodeToTagAwareString(node *html.Node, stripLinebreak bool) string {
+ strs := parser.nodeToTaggedStrings(node, stripLinebreak)
+ var output strings.Builder
+ for _, str := range strs {
+ tstr := str.string
+ if parser.isBlockTag(str.tag) {
+ tstr = fmt.Sprintf("\n%s\n", tstr)
+ }
+ output.WriteString(tstr)
+ }
+ return strings.TrimSpace(output.String())
+}
+
+func (parser *HTMLParser) nodeToStrings(node *html.Node, stripLinebreak bool) (strs []string) {
+ for ; node != nil; node = node.NextSibling {
+ strs = append(strs, parser.singleNodeToString(node, stripLinebreak).string)
+ }
+ return
+}
+
+func (parser *HTMLParser) nodeToString(node *html.Node, stripLinebreak bool) string {
+ return strings.Join(parser.nodeToStrings(node, stripLinebreak), "")
+}
+
+func (parser *HTMLParser) Parse(htmlData string) string {
+ if parser.TabsToSpaces >= 0 {
+ htmlData = strings.Replace(htmlData, "\t", strings.Repeat(" ", parser.TabsToSpaces), -1)
+ }
+ node, _ := html.Parse(strings.NewReader(htmlData))
+ return parser.nodeToTagAwareString(node, true)
+}
+
+func HTMLToText(html string) string {
+ return (&HTMLParser{
+ TabsToSpaces: 4,
+ Newline: "\n",
+ }).Parse(html)
+}