// Copyright 2018 Tulir Asokan package format import ( "fmt" "math" "regexp" "strings" "golang.org/x/net/html" "strconv" ) var MatrixToURL = regexp.MustCompile("^(?:https?://)?(?:www\\.)?matrix\\.to/#/([#@!+].*)(?:/(\\$.+))?") type TextConverter func(string) string type HTMLParser struct { PillConverter func(mxid, eventID string) string TabsToSpaces int Newline string BoldConverter TextConverter ItalicConverter TextConverter StrikethroughConverter TextConverter UnderlineConverter TextConverter MonospaceBlockConverter TextConverter MonospaceConverter TextConverter } type TaggedString struct { string tag string } func (parser *HTMLParser) getAttribute(node *html.Node, attribute string) string { for _, attr := range node.Attr { if attr.Key == attribute { return attr.Val } } return "" } func Digits(num int) int { return int(math.Floor(math.Log10(float64(num))) + 1) } func (parser *HTMLParser) listToString(node *html.Node, stripLinebreak bool) string { ordered := node.Data == "ol" taggedChildren := parser.nodeToTaggedStrings(node.FirstChild, stripLinebreak) counter := 1 indentLength := 0 if ordered { start := parser.getAttribute(node, "start") if len(start) > 0 { counter, _ = strconv.Atoi(start) } longestIndex := (counter - 1) + len(taggedChildren) indentLength = Digits(longestIndex) } indent := strings.Repeat(" ", indentLength+2) var children []string for _, child := range taggedChildren { if child.tag != "li" { continue } var prefix string if ordered { indexPadding := indentLength - Digits(counter) prefix = fmt.Sprintf("%d. %s", counter, strings.Repeat(" ", indexPadding)) } else { prefix = "● " } str := prefix + child.string counter++ parts := strings.Split(str, "\n") for i, part := range parts[1:] { parts[i+1] = indent + part } str = strings.Join(parts, "\n") children = append(children, str) } return strings.Join(children, "\n") } func (parser *HTMLParser) basicFormatToString(node *html.Node, stripLinebreak bool) string { str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak) switch node.Data { case "b", "strong": if parser.BoldConverter != nil { return parser.BoldConverter(str) } return fmt.Sprintf("**%s**", str) case "i", "em": if parser.ItalicConverter != nil { return parser.ItalicConverter(str) } return fmt.Sprintf("_%s_", str) case "s", "del": if parser.StrikethroughConverter != nil { return parser.StrikethroughConverter(str) } return fmt.Sprintf("~~%s~~", str) case "u", "ins": if parser.UnderlineConverter != nil { return parser.UnderlineConverter(str) } case "tt", "code": if parser.MonospaceConverter != nil { return parser.MonospaceConverter(str) } } return str } func (parser *HTMLParser) headerToString(node *html.Node, stripLinebreak bool) string { children := parser.nodeToStrings(node.FirstChild, stripLinebreak) length := int(node.Data[1] - '0') prefix := strings.Repeat("#", length) + " " return prefix + strings.Join(children, "") } func (parser *HTMLParser) blockquoteToString(node *html.Node, stripLinebreak bool) string { str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak) childrenArr := strings.Split(strings.TrimSpace(str), "\n") for index, child := range childrenArr { childrenArr[index] = "> " + child } return strings.Join(childrenArr, "\n") } func (parser *HTMLParser) linkToString(node *html.Node, stripLinebreak bool) string { str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak) href := parser.getAttribute(node, "href") if len(href) == 0 { return str } match := MatrixToURL.FindStringSubmatch(href) if len(match) == 2 || len(match) == 3 { if parser.PillConverter != nil { mxid := match[1] eventID := "" if len(match) == 3 { eventID = match[2] } return parser.PillConverter(mxid, eventID) } return str } return fmt.Sprintf("%s (%s)", str, href) } func (parser *HTMLParser) tagToString(node *html.Node, stripLinebreak bool) string { switch node.Data { case "blockquote": return parser.blockquoteToString(node, stripLinebreak) case "ol", "ul": return parser.listToString(node, stripLinebreak) case "h1", "h2", "h3", "h4", "h5", "h6": return parser.headerToString(node, stripLinebreak) case "br": return parser.Newline case "b", "strong", "i", "em", "s", "del", "u", "ins", "tt", "code": return parser.basicFormatToString(node, stripLinebreak) case "a": return parser.linkToString(node, stripLinebreak) case "p": return parser.nodeToTagAwareString(node.FirstChild, stripLinebreak) + "\n" case "pre": var preStr string if node.FirstChild != nil && node.FirstChild.Type == html.ElementNode && node.FirstChild.Data == "code" { preStr = parser.nodeToString(node.FirstChild.FirstChild, false) } else { preStr = parser.nodeToString(node.FirstChild, false) } if parser.MonospaceBlockConverter != nil { return parser.MonospaceBlockConverter(preStr) } return preStr default: return parser.nodeToTagAwareString(node.FirstChild, stripLinebreak) } } func (parser *HTMLParser) singleNodeToString(node *html.Node, stripLinebreak bool) TaggedString { switch node.Type { case html.TextNode: if stripLinebreak { node.Data = strings.Replace(node.Data, "\n", "", -1) } return TaggedString{node.Data, "text"} case html.ElementNode: return TaggedString{parser.tagToString(node, stripLinebreak), node.Data} case html.DocumentNode: return TaggedString{parser.nodeToTagAwareString(node.FirstChild, stripLinebreak), "html"} default: return TaggedString{"", "unknown"} } } func (parser *HTMLParser) nodeToTaggedStrings(node *html.Node, stripLinebreak bool) (strs []TaggedString) { for ; node != nil; node = node.NextSibling { strs = append(strs, parser.singleNodeToString(node, stripLinebreak)) } return } var BlockTags = []string{"p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "blockquote", "div", "hr", "table"} func (parser *HTMLParser) isBlockTag(tag string) bool { for _, blockTag := range BlockTags { if tag == blockTag { return true } } return false } func (parser *HTMLParser) nodeToTagAwareString(node *html.Node, stripLinebreak bool) string { strs := parser.nodeToTaggedStrings(node, stripLinebreak) var output strings.Builder for _, str := range strs { tstr := str.string if parser.isBlockTag(str.tag) { tstr = fmt.Sprintf("\n%s\n", tstr) } output.WriteString(tstr) } return strings.TrimSpace(output.String()) } func (parser *HTMLParser) nodeToStrings(node *html.Node, stripLinebreak bool) (strs []string) { for ; node != nil; node = node.NextSibling { strs = append(strs, parser.singleNodeToString(node, stripLinebreak).string) } return } func (parser *HTMLParser) nodeToString(node *html.Node, stripLinebreak bool) string { return strings.Join(parser.nodeToStrings(node, stripLinebreak), "") } func (parser *HTMLParser) Parse(htmlData string) string { if parser.TabsToSpaces >= 0 { htmlData = strings.Replace(htmlData, "\t", strings.Repeat(" ", parser.TabsToSpaces), -1) } node, _ := html.Parse(strings.NewReader(htmlData)) return parser.nodeToTagAwareString(node, true) } func HTMLToText(html string) string { return (&HTMLParser{ TabsToSpaces: 4, Newline: "\n", }).Parse(html) }