aboutsummaryrefslogtreecommitdiff
path: root/lib/htmlparser/htmlparser.go
blob: fb4e01285a9641120cc2ddb40966aaa8c8e6b0ae (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
// gomuks - A terminal Matrix client written in Go.
// Copyright (C) 2018 Tulir Asokan
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package htmlparser

import (
	"io"
	"strings"

	"golang.org/x/net/html"
)

// HTMLProcessor receives the callbacks emitted by HTMLParser.Process while it
// walks the token stream of an HTML document.
type HTMLProcessor interface {
	// Preprocess is called once by Process before the first token is read.
	Preprocess()
	// HandleStartTag is called for every start tag that is not treated as
	// self-closing. attrs holds the tag's attributes keyed by attribute name.
	HandleStartTag(tagName string, attrs map[string]string)
	// HandleSelfClosingTag is called for every tag that the tokenizer reports
	// as self-closing, and for every start tag whose name is listed in
	// SelfClosingTags. attrs holds the tag's attributes keyed by attribute name.
	HandleSelfClosingTag(tagName string, attrs map[string]string)
	// HandleText is called for every text token with the token's text.
	HandleText(text string)
	// HandleEndTag is called for every end tag token.
	HandleEndTag(tagName string)
	// ReceiveError is called with the tokenizer's error when an error token
	// is encountered; Process stops reading tokens afterwards.
	ReceiveError(err error)
	// Postprocess is called once by Process after the token loop has ended.
	Postprocess()
}

// HTMLParser couples an html.Tokenizer with the HTMLProcessor that receives
// the tokens it produces. The tokenizer is embedded, so its methods can be
// called directly on the parser.
type HTMLParser struct {
	*html.Tokenizer
	// processor is the receiver of the callbacks emitted by Process.
	processor HTMLProcessor
}

// NewHTMLParserFromTokenizer returns an HTMLParser that reads tokens from z
// and reports them to processor.
func NewHTMLParserFromTokenizer(z *html.Tokenizer, processor HTMLProcessor) HTMLParser {
	return HTMLParser{Tokenizer: z, processor: processor}
}

// NewHTMLParserFromReader returns an HTMLParser that tokenizes the HTML read
// from reader and reports the tokens to processor.
func NewHTMLParserFromReader(reader io.Reader, processor HTMLProcessor) HTMLParser {
	tokenizer := html.NewTokenizer(reader)
	return NewHTMLParserFromTokenizer(tokenizer, processor)
}

// NewHTMLParserFromString returns an HTMLParser that tokenizes the given HTML
// source string and reports the tokens to processor.
func NewHTMLParserFromString(html string, processor HTMLProcessor) HTMLParser {
	source := strings.NewReader(html)
	return NewHTMLParserFromReader(source, processor)
}

// SelfClosingTags lists the tag names that Process reports through
// HandleSelfClosingTag even when the tokenizer returns them as ordinary
// start tags (i.e. when they are written without a trailing slash).
var SelfClosingTags = []string{"img", "br", "hr", "area", "base", "basefont", "input", "link", "meta"}

// mapAttrs drains the attributes of the current tag token from the tokenizer
// and returns them as a map from attribute name to attribute value. If the
// same name occurs more than once, the last occurrence wins. A tag without
// attributes yields an empty (non-nil) map.
func (parser HTMLParser) mapAttrs() map[string]string {
	attrs := make(map[string]string)
	for {
		key, val, hasMore := parser.TagAttr()
		// TagAttr returns a nil key when there is no attribute left to read
		// (notably for tags that have no attributes at all). Skipping it
		// avoids recording a bogus "" -> "" entry for such tags.
		if len(key) > 0 {
			attrs[string(key)] = string(val)
		}
		if !hasMore {
			return attrs
		}
	}
}

// isSelfClosing reports whether tag is one of the names in SelfClosingTags.
// The comparison is an exact string match.
func (parser HTMLParser) isSelfClosing(tag string) bool {
	known := SelfClosingTags
	for i := 0; i < len(known); i++ {
		if known[i] == tag {
			return true
		}
	}
	return false
}

// Process runs the tokenizer to completion and forwards every token to the
// processor. Preprocess is called first and Postprocess last. Token reading
// stops at the first error token, whose error is handed to ReceiveError (the
// tokenizer documents io.EOF as the error at the normal end of input).
// Token types other than text, start, self-closing and end tags are ignored.
func (parser HTMLParser) Process() {
	proc := parser.processor
	proc.Preprocess()

	for finished := false; !finished; {
		switch kind := parser.Next(); kind {
		case html.TextToken:
			proc.HandleText(string(parser.Text()))
		case html.EndTagToken:
			rawName, _ := parser.TagName()
			proc.HandleEndTag(string(rawName))
		case html.StartTagToken, html.SelfClosingTagToken:
			// The tag name must be fetched before the attributes are read.
			rawName, _ := parser.TagName()
			attrs := parser.mapAttrs()
			name := string(rawName)

			// Tags listed in SelfClosingTags count as self-closing even when
			// they were written without a trailing slash.
			if kind == html.SelfClosingTagToken || parser.isSelfClosing(name) {
				proc.HandleSelfClosingTag(name, attrs)
			} else {
				proc.HandleStartTag(name, attrs)
			}
		case html.ErrorToken:
			proc.ReceiveError(parser.Err())
			finished = true
		}
	}

	proc.Postprocess()
}