contrib/rss: use the dom parser
[nit.git] / contrib / rss_downloader / src / rss_downloader.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16
17 # Downloads files from RSS feeds
18 module rss_downloader
19
20 import curl
21 import dom
22
redef class Sys
	# Is verbose output requested?
	#
	# True when `-v` or `--verbose` is among the program arguments.
	# Computed lazily, on first access.
	var verbose: Bool is lazy do
		return args.has("-v") or args.has("--verbose")
	end
end
27
# Configuration of the tool, as returned by `tool_config`
class Config

	# Folders whose sub-folder names are turned into title patterns (may be empty)
	var regex_source_folders: Array[Path]

	# Patterns given explicitly, used in addition to the inferred ones (may be empty)
	var custom_patterns: Array[Pattern]

	# Folder where the matching files are downloaded
	var download_destination_folder: Path

	# URLs of the RSS feeds to read (needs at least one)
	var rss_source_urls: Array[Text]

	# Path to the log file holding the history of downloaded elements
	var log_path: Path

	# Patterns used to compute `Element::unique_id`, to prevent double downloads (may be empty)
	var unique_pattern: Array[Pattern]

	# Patterns of titles exempt from uniqueness, which may be downloaded again (may be empty)
	var unique_exceptions: Array[Pattern]

	# Name of the XML tag, within each feed item, holding the text matched by the patterns
	fun tag_title: String
	do
		return "title"
	end
end
55
# An entry of an RSS feed: a title and the link to the associated file
class Element
	# Title
	var title: String

	# Link to the file to download
	var link: String

	redef fun to_s do return title + " @ " + link

	# Download the file at `link` and store it at `path`
	#
	# A failure is reported on the standard error stream only;
	# nothing is returned to the caller.
	fun download_to(path: Text)
	do
		var response = (new CurlHTTPRequest(link)).download_to_file(path.to_s)

		if response isa CurlResponseFailed then
			sys.stderr.write "Failed downloading URL '{link}' with: {response.error_msg} ({response.error_code})\n"
		end
	end

	# Get a unique identifier for this element, uses `Config::unique_pattern`
	#
	# With the first pattern found in `title`, the identifier is the title
	# up to the end of that match, in lower case.
	# When no pattern is found, it is the unmodified `title`.
	fun unique_id(config: Config): String
	do
		for pattern in config.unique_pattern do
			var found = title.search(pattern)
			if found == null then continue

			return title.substring(0, found.after).to_lower
		end

		return title
	end

	# Is this element exempt from uniqueness?
	#
	# True when `title` contains any of `Config::unique_exceptions`.
	fun is_unique_exception(config: Config): Bool
	do
		for pattern in config.unique_exceptions do
			if title.has(pattern) then return true
		end
		return false
	end
end
101
# Main program structure
#
# Reads the feeds listed in `config`, selects the elements whose title
# matches one of `patterns` and downloads those not already known.
class Downloader
	# Configuration
	var config: Config

	# Identifiers of the elements already downloaded (read from, then written to, the log file)
	var history = new HashSet[Text]

	# Execute tool
	#
	# 1. load `history` from `Config::log_path`, if the file exists;
	# 2. fetch and parse every feed of `Config::rss_source_urls`;
	# 3. keep the elements whose title matches one of `patterns`;
	# 4. download those that are neither on disk nor in `history`;
	# 5. write `history` back to `Config::log_path`.
	fun run
	do
		# Read old log from file
		if config.log_path.exists then
			var stream = config.log_path.open_ro
			for line in stream.read_all.split("\n") do
				# The log ends with a newline, the empty string following it is not an entry
				if not line.is_empty then history.add line
			end
			stream.close
		end

		# Get the pattern to search for
		var patterns = self.patterns

		# Get all the elements from the RSS feeds
		var elements = new HashSet[Element]
		for rss_url in config.rss_source_urls do
			var rss = rss_url.fetch_rss_content
			elements.add_all rss.to_rss_elements
		end

		# Select the elements matching our pattern
		var matches = new HashSet[Element]
		for pattern in patterns do
			for element in elements do
				if element.title.has(pattern) then matches.add element
			end
		end

		if sys.verbose then
			print "\n# {matches.length} matching elements:"
			print matches.join("\n")
			print "\n# Downloading..."
		end

		for element in matches do
			# The title comes from the feed and is not trusted: without its
			# path separators it cannot designate a file outside of the
			# destination folder (absolute title or `../` components).
			var file_name = element.title.replace('/', "_")
			var local_path = config.download_destination_folder.to_s / file_name
			var unique_id = element.unique_id(config)

			if local_path.to_path.exists then
				# Do not redownload a file (we assume that the file name is unique by itself)
				if sys.verbose then print "File exists, skipping {element}"
				continue
			else if history.has(unique_id) then
				# Do not download a file that is not unique according to `unique_id`,
				# unless it is one of the configured exceptions
				if not element.is_unique_exception(config) then
					if sys.verbose then print "File in log, skipping {element}"
					continue
				end
			end

			# Download element
			if sys.verbose then print "Fetching {element} as {local_path}"
			element.download_to(local_path)

			# Add `unique_id` to log
			# NOTE(review): `Element::download_to` only reports failures on
			# stderr, so the identifier is logged even when the download failed.
			history.add unique_id
		end

		# Save new log to file
		var stream = config.log_path.open_wo
		for line in history do
			stream.write line
			stream.write "\n"
		end
		stream.close
	end

	# Gather all patterns from `Config::custom_patterns` and `Config::regex_source_folders`
	#
	# The custom patterns come first, followed by one case-insensitive regex
	# per distinct sub-folder name found in the source folders.
	# Exits with status 1 when there is no pattern at all.
	fun patterns: Array[Pattern]
	do
		var patterns = new Array[Pattern]

		# Begin with custom pattern
		patterns.add_all config.custom_patterns

		# Get regex source from folder names
		var folder_names = new HashSet[Text]
		for source_folder in config.regex_source_folders do
			if not source_folder.exists then
				sys.stderr.write "Regex source folder '{source_folder}' does not exists.\n"
				continue
			end

			for dir in source_folder.files do
				# Keep the name of the folder only: the titles to match
				# do not begin with the path of the source folder
				if dir.stat.is_dir then folder_names.add dir.filename
			end
		end

		# Characters with a special meaning in a regex, to be matched literally.
		# The backslash is first so the escapes added for the others are kept as is.
		# `.` is left alone: as a wildcard it still matches a literal dot.
		var metachars = ['\\', '[', ']', '(', ')', '{', '}', '+', '*', '?', '|', '^', '$']

		# Compile our inferred patterns
		for folder_name in folder_names do
			# Transform from "Some folder name" to "^Some.folder.name"
			var regex_source = folder_name.to_s
			for metachar in metachars do
				regex_source = regex_source.replace(metachar, "\\" + metachar.to_s)
			end
			regex_source = "^" + regex_source.replace(' ', ".")

			var regex = regex_source.to_re
			regex.ignore_case = true

			patterns.add regex
		end

		if patterns.is_empty then
			sys.stderr.write "Do not have any pattern to work with.\n"
			exit 1
		end

		if sys.verbose then
			print "# Generated {patterns.length} patterns"
			print patterns.join("\n")
		end

		return patterns
	end
end
230
redef class Text
	# Get the content of the RSS feed at `self`
	#
	# `self` is used as the URL of the feed.
	# On a failed request, the error is written to stderr and the program
	# exits with status 1.
	fun fetch_rss_content: Text
	do
		if sys.verbose then print "\n# Downloading RSS file from '{self}'"

		var request = new CurlHTTPRequest(to_s)
		var response = request.execute

		if response isa CurlResponseSuccess then
			var body = response.body_str
			if sys.verbose then print "Download successful"
			return body
		else if response isa CurlResponseFailed then
			sys.stderr.write "Failed downloading URL '{self}' with: {response.error_msg} ({response.error_code})\n"
			exit 1
		end

		# Reached only if the response is neither a success nor a failure
		abort
	end

	# Get this RSS feed content as an `Array[Element]`
	#
	# `self` is parsed as XML and one `Element` is built for each
	# `rss/channel/item` holding both a title (tag `Config::tag_title`)
	# and a `link` with content.
	#
	# A document without the `rss/channel` structure is reported on stderr
	# and gives an empty array; items without a usable title or link are
	# skipped, instead of aborting the whole program.
	fun to_rss_elements: Array[Element]
	do
		var elements = new Array[Element]

		var xml = to_xml
		var roots = xml["rss"]
		if roots.is_empty then
			sys.stderr.write "Invalid RSS feed: no 'rss' element found.\n"
			return elements
		end

		var channels = roots.first["channel"]
		if channels.is_empty then
			sys.stderr.write "Invalid RSS feed: no 'channel' element found.\n"
			return elements
		end

		for item in channels.first["item"] do
			var titles = item[tool_config.tag_title]
			var links = item["link"]

			# Both tags are optional in a feed item
			if titles.is_empty or links.is_empty then
				if sys.verbose then print "Skipping an item without title or link"
				continue
			end

			# Only tags with a content carry the data we need
			var title_tag = titles.first
			var link_tag = links.first
			if title_tag isa XMLStartTag and link_tag isa XMLStartTag then
				elements.add new Element(title_tag.data, link_tag.data)
			else if sys.verbose then
				print "Skipping an item with an empty title or link"
			end
		end

		if sys.verbose then
			print "# Found elements:"
			print elements.join("\n")
		end

		return elements
	end
end
274
# Configuration of this tool, `null` unless redefined
#
# Implement this method in your module to configure this tool.
fun tool_config: nullable Config
do
	return null
end
277
# Entry point: refuse to run without a configuration
var configuration = tool_config
if configuration == null then
	print "This tool is not configured, take a look at the example `sample_config.nit`"
	exit 1
	abort # For the flow only
end

# `configuration` is not null from here on
var downloader = new Downloader(configuration)
downloader.run