1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 # Downloads files from RSS feeds
24 # Lazy man's verbose option: true when `-v` or `--verbose` is among the program arguments
# Declared `is lazy`, so the arguments are only inspected when the attribute is first read.
25 var verbose
: Bool = args
.has
("-v") or args
.has
("--verbose") is lazy
28 # Program configuration
31 # Folders whose sub-folder names are used to infer regex patterns (may be empty)
32 var regex_source_folders
: Array[Path]
34 # Custom patterns, used in addition to the ones inferred from folder names (may be empty)
35 var custom_patterns
: Array[Pattern]
37 # Download destination: folder used by `act_on` to build the local path of each file
38 var download_destination_folder
: Path
40 # URLs of the RSS feeds to read (needs at least one)
41 var rss_source_urls
: Array[Text]
43 # Path to the log file
46 # Patterns locating the unique part of file names, used to prevent double downloads (may be empty)
47 var unique_pattern
: Array[Pattern]
49 # Exceptions to uniqueness: elements whose title matches one of these can be downloaded again (may be empty)
50 var unique_exceptions
: Array[Pattern]
# Name of the XML tag holding the text used for pattern recognition
fun tag_title: String
do
	return "title"
end
# Name of the XML tag holding the link to act upon
fun tag_link: String
do
	return "link"
end
58 # Are the feeds at `rss_source_urls` compressed?
# The fetched content is passed through `gunzip` only when this is `true`; `null` behaves like `false`.
59 var compressed
: nullable Bool
61 # Action to apply on each selected RSS element
#
# As written here: downloads `element` to a file named after its title,
# inside `download_destination_folder`.
62 fun act_on
(element
: Element)
64 var local_path
= download_destination_folder
.to_s
/ element
.title
65 element
.download_to
(local_path
)
69 # An element from an RSS feed
74 # Link to the file to download
# Textual form of this element: its title and its link, separated by " @ "
redef fun to_s
do
	return "{title} @ {link}"
end
79 # Download this element to `path`
#
# The content at `link` is fetched with a `CurlHTTPRequest` and written to the file at `path`.
# A failed download is reported on `stderr`; no value is returned to the caller.
80 fun download_to
(path
: Text)
82 var request
= new CurlHTTPRequest(link
)
83 var response
= request
.download_to_file
(path
.to_s
)
85 if response
isa CurlResponseFailed then
86 sys
.stderr
.write
"Failed downloading URL '{link}' with: {response.error_msg} ({response.error_code})\n"
90 # Get a unique identifier for this element, uses `Config::unique_pattern`
#
# For a pattern of `config.unique_pattern` found in the title, the identifier is
# the title from its start up to the end of the match, in lower case.
# NOTE(review): the handling of a pattern that does not match, and the value
# returned when no pattern applies, are not visible in this excerpt.
91 fun unique_id
(config
: Config): String
93 for re
in config
.unique_pattern
do
94 var match
= title
.search
(re
)
96 return title
.substring
(0, match
.after
).to_lower
103 # Is this element exempt from uniqueness?
#
# The title is checked against each pattern of `config.unique_exceptions`.
104 fun is_unique_exception
(config
: Config): Bool
106 for re
in config
.unique_exceptions
do
107 if title
.has
(re
) then
115 # Main program structure
120 # Local history (read from, then written to file)
# Holds the unique identifiers of the elements that were already acted upon.
121 var history
= new HashSet[Text]
126 # Read the old log from file, one history entry per line
127 if config
.log_path
.exists
then
128 var stream
= config
.log_path
.open_ro
129 history
.add_all stream
.read_all
.split
("\n")
133 # Get the patterns to search for
134 var patterns
= self.patterns
136 # Get all the elements from the RSS feeds
137 var elements
= new HashSet[Element]
138 for rss_url
in config
.rss_source_urls
do
139 var rss
= rss_url
.fetch_rss_content
# `compressed` is nullable: comparing with `== true` treats `null` like `false`
140 if config
.compressed
== true then rss
= rss
.gunzip
141 elements
.add_all rss
.to_rss_elements
144 # Select the elements whose title matches at least one of our patterns
145 var matches
= new HashSet[Element]
146 for pattern
in patterns
do for element
in elements
do
147 if element
.title
.has
(pattern
) then
153 print
"\n# {matches.length} matching elements:"
154 print
"* " + matches
.join
("\n* ")
155 print
"\n# Downloading..."
158 for element
in matches
do
159 var unique_id
= element
.unique_id
(config
)
161 if history
.has
(unique_id
) then
162 # Already in the history: do not download it again, unless it is a uniqueness exception
163 if not element
.is_unique_exception
(config
) then
164 # Not an exception: report the skipped element (verbose mode only)
165 if sys
.verbose
then print
"- Skipping {element}"
171 if sys
.verbose
then print
"+ Acting on {element}"
# `tool_config` is nullable; the `as(not null)` cast assumes the tool is configured at this point
173 tool_config
.as(not null).act_on element
175 # Add `unique_id` to log
176 history
.add unique_id
179 # Save the new log to file
180 var stream
= config
.log_path
.open_wo
181 for line
in history
do
188 # Gather all patterns from `Config::custom_patterns` and `Config::regex_source_folders`
#
# Custom patterns come first, followed by one case-insensitive regex per
# sub-folder name found in the regex source folders.
189 fun patterns
: Array[Pattern]
191 var patterns
= new Array[Pattern]
193 # Begin with the custom patterns
194 for pattern_source
in config
.custom_patterns
do
195 patterns
.add pattern_source
198 # Get regex source from folder names
# A set is used, so a name found under several source folders is kept only once
199 var folder_names
= new HashSet[Text]
200 for source_folder_path
in config
.regex_source_folders
do
201 var source_folder
= source_folder_path
203 if not source_folder
.exists
then
204 sys
.stderr
.write
"Regex source folder '{source_folder_path}' does not exists.\n"
# Only the entries that are directories contribute a name
208 for dir
in source_folder
.files
do if dir
.stat
.as(not null).is_dir
then
209 folder_names
.add dir
.filename
213 # Compile our inferred patterns
214 for folder_name
in folder_names
do
215 # Transform from "Some folder name" to "^Some.folder.name"
# Only ' ', '[', '(', '+' and '*' are rewritten; any other regex metacharacter is kept as-is
216 var regex_source
= folder_name
.
217 replace
(' ', ".").replace
('[', "\\[").replace
('(', "\\(").
218 replace
('+', "\\+").replace
('*', "\\*")
219 regex_source
= "^" + regex_source
221 var regex
= regex_source
.to_re
222 regex
.ignore_case
= true
# Without any pattern there is nothing to select: report it on `stderr`
227 if patterns
.is_empty
then
228 sys
.stderr
.write
"Do not have any pattern to work with.\n"
233 print
"# Generated {patterns.length} patterns"
234 print patterns
.join
("\n")
242 # Get the content of the RSS feed at `self`
#
# `self` is used as the URL of a `CurlHTTPRequest`.
# A failed download is reported on `stderr`.
# NOTE(review): the value returned in each branch is not visible in this excerpt.
243 fun fetch_rss_content
: Text
245 if sys
.verbose
then print
"\n# Downloading RSS file from '{self}'"
247 var request
= new CurlHTTPRequest(to_s
)
248 var response
= request
.execute
250 if response
isa CurlResponseSuccess then
251 var body
= response
.body_str
252 if sys
.verbose
then print
"Download successful"
254 else if response
isa CurlResponseFailed then
255 sys
.stderr
.write
"Failed downloading URL '{self}' with: {response.error_msg} ({response.error_code})\n"
262 # Get this RSS feed content as an `Array[Element]`
#
# On a parse error, the error is printed and an empty array is returned.
263 fun to_rss_elements
: Array[Element]
266 if xml
isa XMLError then
267 print_error
"RSS Parse Error: {xml.message}:{xml.location or else "null"}"
268 return new Array[Element]
# The items are looked up under the first `rss` tag, then its first `channel` tag
270 var items
= xml
["rss"].first
["channel"].first
["item"]
272 var elements
= new Array[Element]
# The names of the tags to read come from the tool configuration (`tag_title` and `tag_link`)
274 var title
= item
[tool_config
.as(not null).tag_title
].first
.as(XMLStartTag).data
275 var link
= item
[tool_config
.as(not null).tag_link
].first
.as(XMLStartTag).data
277 elements
.add
new Element(title
, link
)
281 print
"# Found elements:"
282 print
"* " + elements
.join
("\n* ")
288 # Expand the gzip-compressed `self` by piping it through the external `gunzip` program
# `self` is written to the process and its output is read back into `res`.
291 var proc
= new ProcessDuplex("gunzip", new Array[String]...)
292 var res
= proc
.write_and_read
(self)
# A non-zero exit status fails the assertion, after printing the error
293 assert proc
.status
== 0 else print_error
"gunzip failed: {proc.last_error or else "Unknown"}"
# Configuration hook: implement this method in your module to configure this tool
#
# This default implementation provides no configuration and returns `null`.
fun tool_config: nullable Config
do
	return null
end
# Entry point statements.
# NOTE(review): the condition guarding the message below and the definition of `c`
# are not visible in this excerpt; presumably `c` comes from `tool_config` — confirm.
303 print
"This tool is not configured, take a look at the example `sample_config.nit`"
305 abort # For the flow only
# Build the downloader from the configuration `c`
308 var tool
= new Downloader(c
)