323a84e09272ecc4221748a867b0c8bb09a04acd
[nit.git] / contrib / rss_downloader / src / rss_downloader.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16
17 # Downloads files from RSS feeds
18 module rss_downloader
19
20 import curl
21 import dom
22
redef class Sys
	# Poor man's verbosity switch: true when `--verbose` or `-v` is among the program arguments
	var verbose: Bool = args.has("--verbose") or args.has("-v") is lazy
end
27
# Program configuration
class Config

	# Folders used to infer regex (may be empty)
	var regex_source_folders: Array[Path]

	# Custom patterns (may be empty)
	var custom_patterns: Array[Pattern]

	# Download destination
	var download_destination_folder: Path

	# RSS feeds (needs at least one)
	var rss_source_urls: Array[Text]

	# Path to the log file
	var log_path: Path

	# Patterns locating the unique part of a title, used to prevent double downloads (may be empty)
	var unique_pattern: Array[Pattern]

	# Exceptions for which uniqueness is ignored, so they can be downloaded again (may be empty)
	var unique_exceptions: Array[Pattern]

	# XML tag used for pattern recognition
	fun tag_title: String do return "title"

	# Action to apply on each selected RSS element
	#
	# By default, downloads `element` into `download_destination_folder`
	# under a file named after its title.
	fun act_on(element: Element)
	do
		# The title comes from the remote feed: neutralize path separators so
		# the file cannot land outside of `download_destination_folder`
		# (a leading '/' would otherwise replace the destination altogether).
		var file_name = element.title.replace('/', "_")
		var local_path = download_destination_folder.to_s / file_name
		element.download_to(local_path)
	end
end
62
# A downloadable item listed in an RSS feed
class Element
	# Title
	var title: String

	# URL of the file to download
	var link: String

	redef fun to_s
	do
		return "{title} @ {link}"
	end

	# Fetch the file at `link` and store it at `path`
	#
	# A failure is reported on the standard error stream.
	fun download_to(path: Text)
	do
		var http_request = new CurlHTTPRequest(link)
		var outcome = http_request.download_to_file(path.to_s)
		if not outcome isa CurlResponseFailed then return

		sys.stderr.write "Failed downloading URL '{link}' with: {outcome.error_msg} ({outcome.error_code})\n"
	end

	# Unique identifier of this element, computed with `Config::unique_pattern`
	#
	# It is the lowercased start of `title`, up to the end of the first
	# pattern that matches. Without any match, it is `title` itself.
	fun unique_id(config: Config): String
	do
		for pattern in config.unique_pattern do
			var found = title.search(pattern)
			if found == null then continue
			return title.substring(0, found.after).to_lower
		end

		return title
	end

	# Is this element exempt from uniqueness?
	#
	# True when `title` matches one of `Config::unique_exceptions`.
	fun is_unique_exception(config: Config): Bool
	do
		for pattern in config.unique_exceptions do
			if title.has(pattern) then return true
		end
		return false
	end
end
108
# Main program structure
class Downloader
	# Configuration
	var config: Config

	# Local history (read from, then written to, the log file)
	var history = new HashSet[Text]

	# Execute tool
	#
	# Loads the history from `Config::log_path`, fetches every feed of
	# `Config::rss_source_urls`, applies `Config::act_on` to each matching
	# element not already in the history, then saves the history back.
	fun run
	do
		# Read old log from file
		if config.log_path.exists then
			var stream = config.log_path.open_ro
			for line in stream.read_all.split("\n") do
				# The log ends with a newline: do not record the empty
				# entry produced by the split.
				if not line.is_empty then history.add line
			end
			stream.close
		end

		# Get the pattern to search for
		var patterns = self.patterns

		# Get all the elements from the RSS feeds
		var elements = new HashSet[Element]
		for rss_url in config.rss_source_urls do
			var rss = rss_url.fetch_rss_content
			elements.add_all rss.to_rss_elements
		end

		# Select the elements matching our pattern
		var matches = new HashSet[Element]
		for pattern in patterns do
			for element in elements do
				if element.title.has(pattern) then matches.add element
			end
		end

		if sys.verbose then
			print "\n# {matches.length} matching elements:"
			print matches.join("\n")
			print "\n# Downloading..."
		end

		for element in matches do
			var unique_id = element.unique_id(config)

			if history.has(unique_id) then
				# Do not download a file that is not unique according to `unique_id`
				if not element.is_unique_exception(config) then
					# We make some exceptions
					if sys.verbose then print "File in log, skipping {element}"
					continue
				end
			end

			# Download element
			if sys.verbose then print "Acting on {element}"

			# Use the configuration this downloader was built with, not the
			# global (and nullable) `tool_config`.
			config.act_on element

			# Add `unique_id` to log
			history.add unique_id
		end

		# Save new log to file
		var stream = config.log_path.open_wo
		for line in history do
			stream.write line
			stream.write "\n"
		end
		stream.close
	end

	# Gather all patterns from `Config::custom_patterns` and `Config::regex_source_folders`
	#
	# Exits with an error when no pattern is available.
	fun patterns: Array[Pattern]
	do
		var patterns = new Array[Pattern]

		# Begin with custom pattern
		for pattern_source in config.custom_patterns do
			patterns.add pattern_source
		end

		# Get regex source from folder names
		var folder_names = new HashSet[Text]
		for source_folder_path in config.regex_source_folders do
			if not source_folder_path.exists then
				sys.stderr.write "Regex source folder '{source_folder_path}' does not exists.\n"
				continue
			end

			for dir in source_folder_path.files do
				if dir.stat.is_dir then folder_names.add dir.filename
			end
		end

		# Compile our inferred patterns
		for folder_name in folder_names do
			# Transform from "Some folder name" to "^Some.folder.name"
			var regex_source = folder_name.
				replace(' ', ".").replace('[', "\\[").replace('(', "\\(").
				replace('+', "\\+").replace('*', "\\*")
			regex_source = "^" + regex_source

			var regex = regex_source.to_re
			regex.ignore_case = true

			patterns.add regex
		end

		if patterns.is_empty then
			sys.stderr.write "Do not have any pattern to work with.\n"
			exit 1
		end

		if sys.verbose then
			print "# Generated {patterns.length} patterns"
			print patterns.join("\n")
		end

		return patterns
	end
end
233
redef class Text
	# Get the content of the RSS feed at `self`
	#
	# `self` is used as the URL of the feed.
	# Exits the program with an error when the download fails.
	fun fetch_rss_content: Text
	do
		if sys.verbose then print "\n# Downloading RSS file from '{self}'"

		var request = new CurlHTTPRequest(to_s)
		var response = request.execute

		if response isa CurlResponseSuccess then
			var body = response.body_str
			if sys.verbose then print "Download successful"
			return body
		else if response isa CurlResponseFailed then
			sys.stderr.write "Failed downloading URL '{self}' with: {response.error_msg} ({response.error_code})\n"
			exit 1
		end

		abort
	end

	# Get this RSS feed content as an `Array[Element]`
	#
	# `self` is parsed as XML and each `item` of the first `channel` of the
	# first `rss` tag gives an `Element`.
	# Content without an `rss` or `channel` tag gives an empty array, and
	# items without a usable title or `link` tag are skipped; both cases are
	# reported on the standard error stream instead of aborting the program.
	fun to_rss_elements: Array[Element]
	do
		var elements = new Array[Element]

		var rss_tags = to_xml["rss"]
		if rss_tags.is_empty then
			sys.stderr.write "RSS content has no `rss` tag, ignoring it.\n"
			return elements
		end

		var channels = rss_tags.first["channel"]
		if channels.is_empty then
			sys.stderr.write "RSS content has no `channel` tag, ignoring it.\n"
			return elements
		end

		# The title tag is the same for every item, look it up only once
		var tag_title = tool_config.tag_title

		for item in channels.first["item"] do
			var titles = item[tag_title]
			var links = item["link"]
			if titles.is_empty or links.is_empty then
				sys.stderr.write "RSS item without title or link, skipping it.\n"
				continue
			end

			# Only tags with a content can provide the data we need
			var title_tag = titles.first
			var link_tag = links.first
			if not title_tag isa XMLStartTag then
				sys.stderr.write "RSS item with an empty title, skipping it.\n"
				continue
			end
			if not link_tag isa XMLStartTag then
				sys.stderr.write "RSS item with an empty link, skipping it.\n"
				continue
			end

			elements.add new Element(title_tag.data, link_tag.data)
		end

		if sys.verbose then
			print "# Found elements:"
			print elements.join("\n")
		end

		return elements
	end
end
277
# Configuration of this tool, redefine it in your own module
#
# The default implementation gives `null`: the tool is not configured.
fun tool_config: nullable Config
do
	return null
end
280
# Entry point: refuse to run without a configuration, otherwise launch the downloader
var configuration = tool_config
if configuration == null then
	print "This tool is not configured, take a look at the example `sample_config.nit`"
	exit 1
	abort # For the flow only
end

var downloader = new Downloader(configuration)
downloader.run