aboutsummaryrefslogtreecommitdiff
path: root/lib/htmlparser/htmlparser.go
blob: 5ef8d98deb5a7d1f4025c1c6349719495e224199 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// gomuks - A terminal Matrix client written in Go.
// Copyright (C) 2018 Tulir Asokan
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package htmlparser

import (
	"io"
	"strings"

	"golang.org/x/net/html"
)

// HTMLProcessor contains the functions to process parsed HTML data.
// HTMLParser.Process invokes these callbacks while it iterates over
// the tokens produced by the tokenizer.
type HTMLProcessor interface {
	// Preprocess is called once, before the first token is read.
	Preprocess()

	// HandleStartTag is called with the tag name and attributes when
	// the parser encounters a StartTagToken, except if the tag is
	// listed in SelfClosingTags (see HandleSelfClosingTag).
	// attrs maps attribute names to attribute values as returned by
	// the tokenizer.
	HandleStartTag(tagName string, attrs map[string]string)
	// HandleSelfClosingTag is called with the tag name and attributes
	// when the parser encounters a SelfClosingTagToken OR a StartTagToken
	// whose tag name is listed in SelfClosingTags.
	HandleSelfClosingTag(tagName string, attrs map[string]string)
	// HandleText is called with the token's text when the parser
	// encounters a TextToken.
	HandleText(text string)
	// HandleEndTag is called with the tag name when the parser encounters
	// an EndTagToken.
	HandleEndTag(tagName string)

	// ReceiveError is called with the tokenizer's error when the parser
	// encounters an ErrorToken whose error IS NOT io.EOF. Processing
	// stops after this call and Postprocess is not called.
	ReceiveError(err error)

	// Postprocess is called after parsing has completed successfully,
	// i.e. once the tokenizer has reported io.EOF.
	// An unsuccessful parsing will trigger a ReceiveError() call instead.
	Postprocess()
}

// HTMLParser wraps a net/html.Tokenizer and a HTMLProcessor to call
// the HTMLProcessor with data from the Tokenizer.
type HTMLParser struct {
	// Tokenizer is embedded, so its methods (Next, TagName, TagAttr,
	// Text, Err, ...) can be called directly on the parser.
	*html.Tokenizer
	// processor receives the callbacks emitted by Process.
	processor HTMLProcessor
}

// NewHTMLParserFromTokenizer builds an HTMLParser that reads tokens from
// the given, already constructed html Tokenizer and reports them to the
// given processor.
func NewHTMLParserFromTokenizer(z *html.Tokenizer, processor HTMLProcessor) HTMLParser {
	return HTMLParser{Tokenizer: z, processor: processor}
}

// NewHTMLParserFromReader wraps the given io.Reader in a new html
// Tokenizer and returns an HTMLParser built on top of that tokenizer.
func NewHTMLParserFromReader(reader io.Reader, processor HTMLProcessor) HTMLParser {
	tokenizer := html.NewTokenizer(reader)
	return NewHTMLParserFromTokenizer(tokenizer, processor)
}

// NewHTMLParserFromString returns an HTMLParser that tokenizes the given
// string. The string is exposed through a strings.Reader, which is then
// handed to NewHTMLParserFromReader.
func NewHTMLParserFromString(html string, processor HTMLProcessor) HTMLParser {
	source := strings.NewReader(html)
	return NewHTMLParserFromReader(source, processor)
}

// SelfClosingTags is the list of tag names that are always reported
// through HTMLProcessor.HandleSelfClosingTag(), even when the tokenizer
// emits them as a html.StartTagToken rather than a
// html.SelfClosingTagToken.
var SelfClosingTags = []string{"img", "br", "hr", "area", "base", "basefont", "input", "link", "meta"}

// mapAttrs drains the remaining attributes of the current tag token from
// the tokenizer and returns them as a map of attribute name to value.
//
// The returned map is never nil; it is empty when the tag has no
// attributes.
func (parser HTMLParser) mapAttrs() map[string]string {
	attrs := make(map[string]string)
	for {
		key, val, hasMore := parser.TagAttr()
		// TagAttr returns a nil key when the tag has no (more)
		// attributes. Storing that result would add a bogus "" -> ""
		// entry, so only record attributes that actually have a name.
		if len(key) > 0 {
			attrs[string(key)] = string(val)
		}
		if !hasMore {
			return attrs
		}
	}
}

// isSelfClosing reports whether the given tag name is listed in
// SelfClosingTags. The comparison is an exact string match.
func (parser HTMLParser) isSelfClosing(tag string) bool {
	found := false
	for _, candidate := range SelfClosingTags {
		if candidate == tag {
			found = true
			break
		}
	}
	return found
}

// Process runs the tokenizer to completion and forwards each token to the
// matching HTMLProcessor callback.
//
// Preprocess is called first. Text, start/self-closing and end tag tokens
// are dispatched to their handlers; token types without a case below are
// skipped. When the tokenizer reports io.EOF, Postprocess is called and
// Process returns. Any other tokenizer error is passed to ReceiveError and
// Process returns without calling Postprocess.
func (parser HTMLParser) Process() {
	parser.processor.Preprocess()

	for {
		tokenType := parser.Next()

		// An error token always ends processing: either cleanly at
		// end of input, or with a reported error.
		if tokenType == html.ErrorToken {
			if err := parser.Err(); err != io.EOF {
				parser.processor.ReceiveError(err)
				return
			}
			parser.processor.Postprocess()
			return
		}

		switch tokenType {
		case html.TextToken:
			text := string(parser.Text())
			parser.processor.HandleText(text)
		case html.EndTagToken:
			rawName, _ := parser.TagName()
			parser.processor.HandleEndTag(string(rawName))
		case html.StartTagToken, html.SelfClosingTagToken:
			rawName, _ := parser.TagName()
			attributes := parser.mapAttrs()
			name := string(rawName)

			// A regular start tag is one that is neither written as
			// self-closing nor listed in SelfClosingTags.
			if tokenType != html.SelfClosingTagToken && !parser.isSelfClosing(name) {
				parser.processor.HandleStartTag(name, attributes)
				continue
			}
			parser.processor.HandleSelfClosingTag(name, attributes)
		}
	}
}