ed5fc72dc209b3163102ad8863c17141daea7e0e
[nit.git] / contrib / rss_downloader / src / rss_downloader.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16
17 # Downloads files from RSS feeds
18 module rss_downloader
19
20 import curl
21
redef class Sys
	# Should progress messages be printed?
	#
	# True when the command line arguments contain `-v` or `--verbose`.
	# The arguments are inspected only once, on first access.
	var verbose: Bool = args.has("--verbose") or args.has("-v") is lazy
end
26
# Settings of the tool
class Config

	# Folders whose sub-folder names are used to infer title regexes (may be empty)
	var regex_source_folders: Array[Path]

	# Additional patterns given explicitly by the user (may be empty)
	var custom_patterns: Array[Pattern]

	# Folder where the downloaded files are written
	var download_destination_folder: Path

	# URLs of the RSS feeds to read (at least one is needed)
	var rss_source_urls: Array[Text]

	# Path to the log file
	var log_path: Path

	# Patterns applied to titles to derive a unique identifier,
	# used to prevent downloading the same content twice (may be empty)
	var unique_pattern: Array[Pattern]

	# Patterns of titles exempt from the uniqueness check,
	# such elements can be downloaded again (may be empty)
	var unique_exceptions: Array[Pattern]
end
51
# A single entry of an RSS feed: a title and the link to its file
class Element
	# Title of the entry
	var title: String

	# Link to the file to download
	var link: String

	redef fun to_s do return "{title} @ {link}"

	# Download the file at `link` and store it at `path`
	#
	# A failure is reported on `stderr`, it does not stop the program.
	fun download_to(path: Text)
	do
		var destination = path.to_s
		var request = new CurlHTTPRequest(link)
		var result = request.download_to_file(destination)

		if result isa CurlResponseFailed then
			sys.stderr.write "Failed downloading URL '{link}' with: {result.error_msg} ({result.error_code})\n"
		end
	end

	# Get a unique identifier for this element, uses `Config::unique_pattern`
	#
	# The identifier is the lowercased beginning of `title`, up to the end of
	# the first match of the first matching pattern.
	# When no pattern matches, `title` is returned unchanged.
	fun unique_id(config: Config): String
	do
		for pattern in config.unique_pattern do
			var found = title.search(pattern)
			if found == null then continue

			var prefix = title.substring(0, found.after)
			return prefix.to_lower
		end

		return title
	end

	# Is this element exempt from uniqueness?
	#
	# True when `title` matches any of `Config::unique_exceptions`.
	fun is_unique_exception(config: Config): Bool
	do
		for pattern in config.unique_exceptions do
			if title.has(pattern) then return true
		end
		return false
	end
end
97
# Main program structure
#
# Reads the feeds listed in `config`, selects the elements whose title matches
# one of `patterns`, and downloads those that are neither already on disk nor
# recorded in the log file.
class Downloader
	# Configuration
	var config: Config

	# Local history (read from, then written back to, the file at `Config::log_path`)
	var history = new HashSet[Text]

	# Execute tool
	fun run
	do
		# Read old log from file
		if config.log_path.exists then
			var stream = config.log_path.open_ro
			var content = stream.read_all
			stream.close

			# The log is written with a newline after each entry, so splitting
			# yields a trailing empty string: do not record it as an identifier.
			for line in content.split("\n") do
				if not line.is_empty then history.add line
			end
		end

		# Get the pattern to search for
		var patterns = self.patterns

		# Get all the elements from the RSS feeds
		var elements = new HashSet[Element]
		for rss_url in config.rss_source_urls do
			var rss = rss_url.fetch_rss_content
			elements.add_all rss.to_rss_elements
		end

		# Select the elements matching our pattern
		var matches = new HashSet[Element]
		for pattern in patterns do
			for element in elements do
				if element.title.has(pattern) then matches.add element
			end
		end

		if sys.verbose then
			print "\n# {matches.length} matching elements:"
			print matches.join("\n")
			print "\n# Downloading..."
		end

		for element in matches do
			var local_path = config.download_destination_folder.to_s / element.title
			var unique_id = element.unique_id(config)

			if local_path.to_path.exists then
				# Do not redownload a file (we assume that the file name is unique by itself)
				if sys.verbose then print "File exists, skipping {element}"
				continue
			else if history.has(unique_id) then
				# Do not download a file that is not unique according to `unique_id`
				if not element.is_unique_exception(config) then
					# We make some exceptions
					if sys.verbose then print "File in log, skipping {element}"
					continue
				end
			end

			# Download element
			if sys.verbose then print "Fetching {element} as {local_path}"
			element.download_to(local_path)

			# Add `unique_id` to log
			# NOTE(review): `download_to` does not report failures to its caller,
			# so the identifier is recorded even when the download failed.
			history.add unique_id
		end

		# Save new log to file
		var stream = config.log_path.open_wo
		for line in history do
			stream.write line
			stream.write "\n"
		end
		stream.close
	end

	# Gather all patterns from `Config::custom_patterns` and `Config::regex_source_folders`
	#
	# Each sub-folder of the source folders yields a case-insensitive regex
	# anchored at the beginning of the title.
	# Exits the program with status 1 when no pattern is available.
	fun patterns: Array[Pattern]
	do
		var patterns = new Array[Pattern]

		# Begin with custom pattern
		patterns.add_all config.custom_patterns

		# Get regex source from folder names
		var folder_names = new HashSet[Text]
		for source_folder_path in config.regex_source_folders do
			if not source_folder_path.exists then
				sys.stderr.write "Regex source folder '{source_folder_path}' does not exists.\n"
				continue
			end

			for dir in source_folder_path.files do
				# Keep only the name of the folder, not the path leading to it:
				# the regex is anchored with "^" and matched against titles.
				if dir.stat.is_dir then folder_names.add dir.filename
			end
		end

		# Compile our inferred patterns
		for folder_name in folder_names do
			# Transform from "Some folder name" to "^Some.folder.name"
			# NOTE(review): only ' ', '[', '(', '+' and '*' are handled,
			# other regex metacharacters are passed through as is.
			var regex_source = folder_name.
				replace(' ', ".").replace('[', "\\[").replace('(', "\\(").
				replace('+', "\\+").replace('*', "\\*")
			regex_source = "^" + regex_source

			var regex = regex_source.to_re
			regex.ignore_case = true

			patterns.add regex
		end

		if patterns.is_empty then
			sys.stderr.write "Do not have any pattern to work with.\n"
			exit 1
		end

		if sys.verbose then
			print "# Generated {patterns.length} patterns"
			print patterns.join("\n")
		end

		return patterns
	end
end
226
redef class Text
	# Download the RSS feed located at the URL `self` and return its body
	#
	# On a failed request, the error is reported on `stderr` and
	# the program exits with status 1.
	fun fetch_rss_content: Text
	do
		if sys.verbose then print "\n# Downloading RSS file from '{self}'"

		var url = to_s
		var request = new CurlHTTPRequest(url)
		var result = request.execute

		if result isa CurlResponseFailed then
			sys.stderr.write "Failed downloading URL '{self}' with: {result.error_msg} ({result.error_code})\n"
			exit 1
		else if result isa CurlResponseSuccess then
			var content = result.body_str
			if sys.verbose then print "Download successful"
			return content
		end

		# Not expected to be reached, needed for the flow analysis
		abort
	end

	# Parse `self` as the content of an RSS feed and get its `Element`s
	#
	# Titles are expected in the form `<title><![CDATA[...]]></title>`,
	# and only the links containing `download` are considered.
	# The i-th title is paired with the i-th link, so as many titles
	# as links must be found.
	fun to_rss_elements: Array[Element]
	do
		var title_re = "<title><![^/]*</title>".to_re
		var link_re = "<link>[^<]*download[^<]*</link>".to_re

		# Length of the markup surrounding the text of each match
		var title_head = "<title><![CDATA[".length
		var title_tail = "]]></title>".length
		var link_head = "<link>".length
		var link_tail = "</link>".length

		var titles = search_all(title_re)
		var links = search_all(link_re)

		if sys.verbose then print "\n# Found {titles.length} titles and {links.length} links"
		assert titles.length == links.length

		var elements = new Array[Element]
		for i in [0..titles.length[ do
			var title_match = titles[i]
			var link_match = links[i]

			# Strip the surrounding markup to keep only the text
			var title = title_match.to_s.substring(title_head, title_match.length - title_head - title_tail)
			var link = link_match.to_s.substring(link_head, link_match.length - link_head - link_tail)

			elements.add new Element(title, link)
		end

		if sys.verbose then
			print "# Found elements:"
			print elements.join("\n")
		end

		return elements
	end
end
278
# Configuration of this tool, `null` when it is not configured
#
# Implement this method in your module to configure this tool.
fun tool_config: nullable Config
do
	return null
end
281
# Entry point: refuse to run without a configuration
var config = tool_config
if config == null then
	print "This tool is not configured, take a look at the example `sample_config.nit`"
	exit 1
	abort # For the flow only
end

# Run the tool with the user configuration
var downloader = new Downloader(config)
downloader.run