forked from emmaly/go-pkg-rss
-
Notifications
You must be signed in to change notification settings - Fork 0
/
feed.go
237 lines (194 loc) · 6.45 KB
/
feed.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
/*
Author: jim teeuwen <[email protected]>
Dependencies: go-pkg-xmlx (http://github.com/jteeuwen/go-pkg-xmlx)
This package allows us to fetch Rss and Atom feeds from the internet.
They are parsed into an object tree which is a hybrid of both the RSS and Atom
standards.
Supported feeds are:
- Rss v0.91, 0.92 and 2.0
- Atom 1.0
The package allows us to maintain cache timeout management. This prevents us
from querying the servers for feed updates too often and risking IP bans. Apart
from setting a cache timeout manually, the package also optionally adheres to
the TTL, SkipDays and SkipHours values specified in the feeds themselves.
Note that the TTL, SkipDays and SkipHours fields are only part of the RSS spec.
For Atom feeds, we use the CacheTimeout in the Feed struct.
Because the object structure is a hybrid between both RSS and Atom specs, not
all fields will be filled when requesting either an RSS or Atom feed. I have
tried to create as many shared fields as possible, but some of them simply do
not occur in either the RSS or Atom spec.
*/
package feeder
import (
"errors"
"fmt"
xmlx "github.com/jteeuwen/go-pkg-xmlx"
"strconv"
"strings"
"time"
)
// Callback signatures through which a Feed reports content to its host.
type (
	// ChannelHandler is the callback type stored by New as the feed's channel
	// notification function. Fetch invokes it with the feed itself and the
	// channels that were appended to Feed.Channels by that fetch.
	ChannelHandler func(f *Feed, newchannels []*Channel)

	// ItemHandler is the callback type stored by New as the feed's item
	// notification function. It is meant to notify the host when new items
	// have been found for a given channel; it receives the feed, the channel
	// and the new items.
	ItemHandler func(f *Feed, ch *Channel, newitems []*Item)
)
// Feed holds the configuration and the parsed content of one RSS or Atom
// feed, together with the bookkeeping needed to throttle updates.
type Feed struct {
	// CacheTimeout is the custom cache timeout, in minutes.
	CacheTimeout int

	// EnforceCacheLimit makes the feed honour the caching hints found in the
	// feed itself. When set, Fetch raises CacheTimeout to the first channel's
	// TTL if that TTL is larger, and CanUpdate consults the first channel's
	// SkipDays and SkipHours.
	EnforceCacheLimit bool

	// Type is the kind of feed: Rss, Atom, etc.
	Type string

	// Version is the feed version as {major, minor}.
	Version [2]int

	// Channels holds the channels with content.
	Channels []*Channel

	// Url is the location this feed was created from.
	Url string

	// chanhandler, when non-nil, is called to tell the host that new
	// channels have been found.
	chanhandler ChannelHandler

	// itemhandler is the notification function used to tell the host that
	// new items have been found for a given channel.
	itemhandler ItemHandler

	// lastupdate is the last time an update was permitted, stored as
	// nanoseconds since the Unix epoch. It is used together with
	// CacheTimeout to make sure content is not fetched too often.
	lastupdate int64
}
// New creates a Feed that has not fetched anything yet.
//
// cachetimeout is stored as Feed.CacheTimeout (minutes) and
// enforcecachelimit as Feed.EnforceCacheLimit. ch and ih are kept as the
// channel and item notification callbacks; either may be nil. The feed type
// starts out as "none".
func New(cachetimeout int, enforcecachelimit bool, ch ChannelHandler, ih ItemHandler) *Feed {
	return &Feed{
		CacheTimeout:      cachetimeout,
		EnforceCacheLimit: enforcecachelimit,
		Type:              "none",
		chanhandler:       ch,
		itemhandler:       ih,
	}
}
// LastUpdate returns the timestamp recorded the last time CanUpdate allowed
// an update. The value is expressed in nanoseconds since the Unix epoch (it
// is taken from time.Time.UnixNano) and is zero if no update has been
// allowed yet.
func (this *Feed) LastUpdate() int64 {
	return this.lastupdate
}
// Fetch retrieves the feed's latest content if necessary.
//
// Nothing is loaded and nil is returned when CanUpdate reports that no update
// is due. Otherwise the document at uri is loaded, its type and version are
// detected, and the channels are (re)built from it.
//
// The charset parameter overrides the xml decoder's CharsetReader. This
// allows a custom character encoding conversion routine to be used for
// non-utf8 input. Supply nil to use the default from Go's xml package.
//
// An error is returned when the document cannot be loaded, when the detected
// feed type/version is not supported, or when building the feed fails.
func (this *Feed) Fetch(uri string, charset xmlx.CharsetFunc) (err error) {
	if !this.CanUpdate() {
		return nil
	}

	this.Url = uri

	// Load the document first; its type and version decide which parser is
	// used (rss 0.91, rss 0.92, rss 2, atom etc).
	doc := xmlx.New()
	if err = doc.LoadUri(uri, charset); err != nil {
		return err
	}

	this.Type, this.Version = this.GetVersionInfo(doc)
	if !this.testVersions() {
		return errors.New(fmt.Sprintf("Unsupported feed: %s, version: %+v", this.Type, this.Version))
	}

	// Remember how many channels were known so the new ones can be reported.
	known := len(this.Channels)
	if err = this.buildFeed(doc); err != nil {
		return err
	}
	if len(this.Channels) == 0 {
		return nil
	}

	// Tell the host about every channel appended by this fetch.
	if this.chanhandler != nil && len(this.Channels) != known {
		this.chanhandler(this, this.Channels[known:])
	}

	// Let the feed's own TTL raise our cache timeout when asked to honour it.
	if this.EnforceCacheLimit {
		if ttl := this.Channels[0].TTL; this.CacheTimeout < ttl {
			this.CacheTimeout = ttl
		}
	}
	return nil
}
// CanUpdate reports whether a fresh feed update may be performed right now.
//
// It returns false while fewer than CacheTimeout minutes have elapsed since
// the last permitted update. If EnforceCacheLimit is set and the feed is an
// RSS feed with at least one parsed channel, it also returns false when the
// current UTC weekday is listed in the first channel's SkipDays or the
// current UTC hour is listed in its SkipHours.
//
// When it returns true, the current time is recorded as the last update
// time, so with a positive CacheTimeout the next call returns false until
// that timeout has elapsed.
func (this *Feed) CanUpdate() bool {
	utc := time.Now().UTC()
	now := utc.UnixNano()

	// Make sure we are not within the specified cache limit. lastupdate is
	// kept in nanoseconds, so the timeout (minutes) must be converted to
	// nanoseconds before the two are compared.
	timeout := time.Duration(this.CacheTimeout) * time.Minute
	if now-this.lastupdate < int64(timeout) {
		return false
	}

	// SkipDays and SkipHours exist only in RSS and can only be consulted once
	// a channel has actually been parsed.
	if this.EnforceCacheLimit && this.Type == "rss" && len(this.Channels) > 0 {
		first := this.Channels[0]

		today := utc.Weekday()
		for _, v := range first.SkipDays {
			if time.Weekday(v) == today {
				return false
			}
		}

		hour := utc.Hour()
		for _, v := range first.SkipHours {
			if v == hour {
				return false
			}
		}
	}

	this.lastupdate = now
	return true
}
// SecondsTillUpdate returns the number of seconds that still need to elapse
// before the feed should update: the cache timeout (CacheTimeout minutes,
// expressed in seconds) minus the whole seconds elapsed since the last
// recorded update. The result is zero or negative once the timeout has passed.
func (this *Feed) SecondsTillUpdate() int64 {
	timeout := int64(this.CacheTimeout * 60)

	// lastupdate is stored in nanoseconds; reduce it to whole seconds.
	lastSeconds := this.lastupdate / int64(time.Second)
	elapsed := time.Now().UTC().Unix() - lastSeconds

	return timeout - elapsed
}
// buildFeed hands doc to the reader that matches the feed's Type: readRss2
// for "rss" and readAtom for "atom". Any other type is left untouched and
// yields a nil error.
func (this *Feed) buildFeed(doc *xmlx.Document) (err error) {
	if this.Type == "rss" {
		return this.readRss2(doc)
	}
	if this.Type == "atom" {
		return this.readAtom(doc)
	}
	return nil
}
// testVersions reports whether the feed's Type and Version are supported.
// RSS is accepted up to and including version 2.0, Atom up to and including
// version 1.0; every other feed type is rejected.
func (this *Feed) testVersions() bool {
	// Highest supported major version per feed type; the highest supported
	// minor version within that major is 0 for both.
	var maxMajor int
	switch this.Type {
	case "rss":
		maxMajor = 2
	case "atom":
		maxMajor = 1
	default:
		return false
	}

	major, minor := this.Version[0], this.Version[1]
	if major != maxMajor {
		return major < maxMajor
	}
	return minor <= 0
}
// GetVersionInfo determines the feed type and version of doc.
//
// The document is probed in this order:
//   - an Atom "feed" node yields ("atom", {1, 0});
//   - an "rss" node yields "rss" with the version parsed from its "version"
//     attribute ("major.minor", or just "major");
//   - an RDF node yields ("rss", {1, 1});
//   - anything else yields ("unknown", {0, 0}).
//
// A missing, empty or non-numeric version component is reported as 0.
func (this *Feed) GetVersionInfo(doc *xmlx.Document) (ftype string, fversion [2]int) {
	if node := doc.SelectNode("http://www.w3.org/2005/Atom", "feed"); node != nil {
		return "atom", [2]int{1, 0}
	}

	if node := doc.SelectNode("", "rss"); node != nil {
		version := strings.TrimSpace(node.As("", "version"))

		// The attribute may lack a dot (e.g. "2") or be absent altogether;
		// slicing at the position of a missing dot would panic, so only split
		// when a dot is actually present.
		majorText, minorText := version, ""
		if p := strings.Index(version, "."); p >= 0 {
			majorText, minorText = version[:p], version[p+1:]
		}

		// Parse errors are deliberately ignored: an unparsable component
		// simply counts as 0.
		major, _ := strconv.Atoi(majorText)
		minor, _ := strconv.Atoi(minorText)
		return "rss", [2]int{major, minor}
	}

	// issue#5: Some documents have an RDF root node instead of rss.
	if node := doc.SelectNode("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "RDF"); node != nil {
		return "rss", [2]int{1, 1}
	}

	return "unknown", [2]int{0, 0}
}