d512e9f4160128840f544e4752801d7a7e36f208
[nit.git] / contrib / rss_downloader / src / rss_downloader.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16
17 # Downloads files from RSS feeds
18 module rss_downloader
19
20 import curl
21 import dom
22
redef class Sys
	# Should progress details be printed?
	#
	# True when `-v` or `--verbose` is among the program arguments.
	# The value is computed on first access only.
	var verbose: Bool is lazy do
		if args.has("-v") then return true
		return args.has("--verbose")
	end
end
27
# Program configuration
class Config

	# Folders used to infer regex (may be empty)
	var regex_source_folders: Array[Path]

	# Custom patterns (may be empty)
	var custom_patterns: Array[Pattern]

	# Download destination
	var download_destination_folder: Path

	# RSS feeds (needs at least one)
	var rss_source_urls: Array[Text]

	# Path to the log file
	var log_path: Path

	# Patterns locating the unique part of an element title (may be empty)
	#
	# Used by `Element::unique_id` to prevent double downloads.
	var unique_pattern: Array[Pattern]

	# Exception where we ignore uniqueness and can be downloaded again (may be empty)
	var unique_exceptions: Array[Pattern]

	# XML tag used for pattern recognition
	fun tag_title: String do return "title"

	# XML tag of the link to act upon
	fun tag_link: String do return "link"

	# Are the feeds at `rss_source_urls` compressed?
	#
	# Feeds are only decompressed when this is exactly `true`.
	var compressed: nullable Bool

	# Action to apply on each selected RSS element
	#
	# Downloads `element` into `download_destination_folder`, using its title
	# as the file name.
	fun act_on(element: Element)
	do
		# The title comes straight from the feed: a `/` in it would point into a
		# sub-folder that may not exist, and a leading `/` would make the path
		# join yield an absolute path outside of the destination folder.
		var file_name = element.title.replace('/', "_")

		var local_path = download_destination_folder.to_s / file_name
		element.download_to(local_path)
	end
end
68
# An element from an RSS feed
class Element
	# Title
	var title: String

	# Link to the file to download
	var link: String

	redef fun to_s do return "{title} @ {link}"

	# Download the file at `link` and store it at `path`
	#
	# A failure is reported on stderr, it does not stop the program.
	fun download_to(path: Text)
	do
		var destination = path.to_s
		var http_request = new CurlHTTPRequest(link)
		var result = http_request.download_to_file(destination)

		if result isa CurlResponseFailed then
			sys.stderr.write "Failed downloading URL '{link}' with: {result.error_msg} ({result.error_code})\n"
		end
	end

	# Get a unique identifier for this element, uses `Config::unique_pattern`
	#
	# With the first pattern found in `title`, the identifier is the lowercased
	# title up to the end of the match. Without any match, it is the whole
	# `title`, unchanged.
	fun unique_id(config: Config): String
	do
		for pattern in config.unique_pattern do
			var found = title.search(pattern)
			if found == null then continue

			var prefix = title.substring(0, found.after)
			return prefix.to_lower
		end

		return title
	end

	# Is this element exempt from uniqueness?
	#
	# True when `title` contains any of `Config::unique_exceptions`.
	fun is_unique_exception(config: Config): Bool
	do
		for pattern in config.unique_exceptions do if title.has(pattern) then return true
		return false
	end
end
114
# Main program structure
class Downloader
	# Configuration
	var config: Config

	# Local history (read from, then written back to, the log file)
	var history = new HashSet[Text]

	# Execute tool
	#
	# Loads the history from `Config::log_path`, fetches every feed, acts on the
	# elements matching `patterns` that are not already in the history, then
	# writes the updated history back to the log file.
	fun run
	do
		# Read old log from file
		if config.log_path.exists then
			var stream = config.log_path.open_ro
			for line in stream.read_all.split("\n") do
				# The log ends with a newline, do not record the empty
				# trailing piece as if it was a downloaded element.
				if not line.is_empty then history.add line
			end
			stream.close
		end

		# Get the pattern to search for
		var patterns = self.patterns

		# Get all the elements from the RSS feeds
		var elements = new HashSet[Element]
		for rss_url in config.rss_source_urls do
			var rss = rss_url.fetch_rss_content
			if config.compressed == true then rss = rss.gunzip
			elements.add_all rss.to_rss_elements
		end

		# Select the elements matching our pattern
		var matches = new HashSet[Element]
		for pattern in patterns do for element in elements do
			if element.title.has(pattern) then
				matches.add element
			end
		end

		if sys.verbose then
			print "\n# {matches.length} matching elements:"
			print matches.join("\n")
			print "\n# Downloading..."
		end

		for element in matches do
			var unique_id = element.unique_id(config)

			if history.has(unique_id) then
				# Do not download a file that is not unique according to `unique_id`
				if not element.is_unique_exception(config) then
					# We make some exceptions
					if sys.verbose then print "File in log, skipping {element}"
					continue
				end
			end

			# Download element
			if sys.verbose then print "Acting on {element}"

			# Use the configuration given to this instance, not the global one
			config.act_on element

			# Add `unique_id` to log
			history.add unique_id
		end

		# Save new log to file
		var stream = config.log_path.open_wo
		for line in history do
			stream.write line
			stream.write "\n"
		end
		stream.close
	end

	# Gather all patterns from `Config::custom_patterns` and `Config::regex_source_folders`
	#
	# Each sub-folder of a source folder gives a case-insensitive regex anchored
	# at the beginning of the title. Exits with status 1 if there is no pattern
	# at all.
	fun patterns: Array[Pattern]
	do
		var patterns = new Array[Pattern]

		# Begin with custom pattern
		for pattern_source in config.custom_patterns do
			patterns.add pattern_source
		end

		# Get regex source from folder names
		var folder_names = new HashSet[Text]
		for source_folder in config.regex_source_folders do
			if not source_folder.exists then
				sys.stderr.write "Regex source folder '{source_folder}' does not exists.\n"
				continue
			end

			for dir in source_folder.files do
				if dir.stat.is_dir then folder_names.add dir.filename
			end
		end

		# Compile our inferred patterns
		for folder_name in folder_names do
			# Transform from "Some folder name" to "^Some.folder.name"
			var regex_source = folder_name.replace(' ', ".")

			# Escape the characters that would otherwise act as regex operators.
			# The backslash goes first so the escapes added next are left alone.
			# `.` is deliberately not escaped: it keeps matching any separator.
			# NOTE(review): assumes `to_re` builds an extended (ERE) regex — confirm.
			for special in ['\\', '[', '(', ')', '{', '+', '*', '?', '|', '^', '$'] do
				regex_source = regex_source.replace(special, "\\" + special.to_s)
			end
			regex_source = "^" + regex_source

			var regex = regex_source.to_re
			regex.ignore_case = true

			patterns.add regex
		end

		if patterns.is_empty then
			sys.stderr.write "Do not have any pattern to work with.\n"
			exit 1
		end

		if sys.verbose then
			print "# Generated {patterns.length} patterns"
			print patterns.join("\n")
		end

		return patterns
	end
end
240
redef class Text
	# Get the content of the RSS feed at `self`
	#
	# `self` is used as the URL of the request. If the request fails, the error
	# is written to stderr and the program exits with status 1.
	fun fetch_rss_content: Text
	do
		if sys.verbose then print "\n# Downloading RSS file from '{self}'"

		var request = new CurlHTTPRequest(to_s)
		var response = request.execute

		if response isa CurlResponseSuccess then
			var body = response.body_str
			if sys.verbose then print "Download successful"
			return body
		else if response isa CurlResponseFailed then
			sys.stderr.write "Failed downloading URL '{self}' with: {response.error_msg} ({response.error_code})\n"
			exit 1
		end

		# Neither a success nor a failure, should not happen
		abort
	end

	# Get this RSS feed content as an `Array[Element]`
	#
	# Reads the `item` elements of the first `channel` of the first `rss`
	# element, taking title and link from the tags named by `tool_config`.
	#
	# A feed that cannot be parsed, or that has no `rss` or `channel` element,
	# is reported with `print_error` and gives an empty array. An item without
	# a usable title or link is reported and skipped.
	fun to_rss_elements: Array[Element]
	do
		var elements = new Array[Element]

		var xml = to_xml
		if xml isa XMLError then
			print_error "RSS Parse Error: {xml.message}:{xml.location or else "null"}"
			return elements
		end

		# Check each level before calling `first`, which aborts on an empty array
		var rss_tags = xml["rss"]
		if rss_tags.is_empty then
			print_error "RSS Parse Error: no `rss` element found"
			return elements
		end

		var channels = rss_tags.first["channel"]
		if channels.is_empty then
			print_error "RSS Parse Error: no `channel` element found"
			return elements
		end

		for item in channels.first["item"] do
			var titles = item[tool_config.tag_title]
			var links = item[tool_config.tag_link]
			if titles.is_empty or links.is_empty then
				print_error "RSS Parse Error: item without title or link, skipping it"
				continue
			end

			# Only tags with content carry the data we need
			var title_tag = titles.first
			var link_tag = links.first
			if title_tag isa XMLStartTag and link_tag isa XMLStartTag then
				elements.add new Element(title_tag.data, link_tag.data)
			else
				print_error "RSS Parse Error: item with an empty title or link, skipping it"
			end
		end

		if sys.verbose then
			print "# Found elements:"
			print "* " + elements.join("\n* ")
		end

		return elements
	end

	# Expand the compressed `self` using the external `gunzip` command
	#
	# `self` is written to the standard input of `gunzip` and the whole output
	# is returned. Asserts that `gunzip` exits with status 0.
	fun gunzip: String
	do
		var proc = new ProcessDuplex("gunzip", new Array[String]...)
		proc.write self
		# Close the input of `gunzip` so it knows the data is complete
		proc.stream_out.close
		var res = proc.read_all
		proc.stream_in.close
		proc.wait
		assert proc.status == 0
		return res
	end
end
301
# Configuration of this tool, to be provided by your own module
#
# Redefine this method to return your `Config`.
# By default there is none and `null` is returned.
fun tool_config: nullable Config
do
	return null
end
304
# Entry point: refuse to run without a configuration, otherwise launch the downloader
var configuration = tool_config
if configuration == null then
	print "This tool is not configured, take a look at the example `sample_config.nit`"
	exit 1
	abort # For the flow only
end

var downloader = new Downloader(configuration)
downloader.run