1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 # Downloads files from RSS feeds
# Lazy man's verbose option
#
# True when either `--verbose` or `-v` was passed on the command line.
# Computed on first access only (`is lazy`).
var verbose: Bool = args.has("--verbose") or args.has("-v") is lazy
# Program configuration
# Folders used to infer regex (may be empty)
var regex_source_folders: Array[Path]

# Custom patterns (may be empty)
var custom_patterns: Array[Pattern]

# Download destination
var download_destination_folder: Path

# RSS feeds (needs at least one)
var rss_source_urls: Array[Text]

# Path to the log file

# Unique patterns of file names to prevent double download (may be empty)
var unique_pattern: Array[Pattern]

# Exceptions where uniqueness is ignored and the file may be downloaded again (may be empty)
var unique_exceptions: Array[Pattern]
52 # An element from an RSS feed
57 # Link to the file to download
# Render this element as `title @ link`
redef fun to_s
do
	return "{title} @ {link}"
end
62 # Download this element to `path`
63 fun download_to
(path
: Text)
# Build a curl request for this element's `link`
67 var request
= new CurlHTTPRequest(link
, curl
)
# Stream the response directly into the file at `path`
68 var response
= request
.download_to_file
(path
.to_s
)
# NOTE(review): the success-branch body (original line ~71) is missing from this chunk — confirm
70 if response
isa CurlFileResponseSuccess then
# On failure, report the error on stderr
72 else if response
isa CurlResponseFailed then
73 sys
.stderr
.write
"Failed downloading URL '{link}' with: {response.error_msg} ({response.error_code})\n"
77 # Get an unique identifier for this element, uses `Config::unique_pattern`
78 fun unique_id
(config
: Config): String
# Try each configured uniqueness pattern against the title
80 for re
in config
.unique_pattern
do
81 var match
= title
.search
(re
)
# First match wins: the id is the lowercased title up to the end of the match
# NOTE(review): the null-check on `match` (original line ~82) is missing from this chunk — confirm
83 return title
.substring
(0, match
.after
).to_lower
90 # Is this element exempt from uniqueness?
91 fun is_unique_exception
(config
: Config): Bool
# Check the title against each configured exception pattern
# NOTE(review): the loop body and return (original lines 94+) are missing from this chunk
93 for re
in config
.unique_exceptions
do
102 # Main program structure
107 # Local history (read from, then written to file)
108 var history
= new HashSet[Text]
113 # Read old log from file
114 if config
.log_path
.exists
then
115 var stream
= config
.log_path
.open_ro
# Each line of the log is one unique id already downloaded
116 history
.add_all stream
.read_all
.split
("\n")
# NOTE(review): `stream.close` is not visible in this chunk — confirm it exists downstream
120 # Get the pattern to search for
121 var patterns
= self.patterns
123 # Get all the elements from the RSS feeds
124 var elements
= new HashSet[Element]
125 for rss_url
in config
.rss_source_urls
do
126 var rss
= rss_url
.fetch_rss_content
127 elements
.add_all rss
.to_rss_elements
130 # Select the elements matching our pattern
# The HashSet deduplicates elements matched by more than one pattern
131 var matches
= new HashSet[Element]
132 for pattern
in patterns
do for element
in elements
do
# NOTE(review): the match-collecting body (original lines 134+) is missing from this chunk
133 if element
.title
.has
(pattern
) then
139 print
"\n# {matches.length} matching elements:"
140 print matches
.join
("\n")
141 print
"\n# Downloading..."
144 for element
in matches
do
# Local destination: <download folder>/<element title>
145 var local_path
= config
.download_destination_folder
.to_s
/ element
.title
146 var unique_id
= element
.unique_id
(config
)
# Skip when the target file already exists on disk
148 if local_path
.to_path
.exists
then
149 # Do not redownload a file (we assume that the file name is unique by itself)
150 if sys
.verbose
then print
"File exists, skipping {element}"
# Skip when the unique id was already logged, unless it is a configured exception
152 else if history
.has
(unique_id
) then
153 # Do not download a file that is not unique according to `unique_id`
154 if not element
.is_unique_exception
(config
) then
155 # We make some exceptions
156 if sys
.verbose
then print
"File in log, skipping {element}"
162 if sys
.verbose
then print
"Fetching {element} as {local_path}"
163 element
.download_to
(local_path
)
165 # Add `unique_id` to log
166 history
.add unique_id
169 # Save new log to file
170 var stream
= config
.log_path
.open_wo
# NOTE(review): the body writing each history line (original lines 172+) is missing from this chunk
171 for line
in history
do
178 # Gather all patterns from `Config::custom_patterns` and `Config::source_folder_path`
179 fun patterns
: Array[Pattern]
181 var patterns
= new Array[Pattern]
183 # Begin with custom pattern
184 for pattern_source
in config
.custom_patterns
do
185 patterns
.add pattern_source
188 # Get regex source from folder names
189 var folder_names
= new HashSet[Text]
190 for source_folder_path
in config
.regex_source_folders
do
191 var source_folder
= source_folder_path
# Warn on stderr about misconfigured folders instead of failing hard
193 if not source_folder
.exists
then
194 sys
.stderr
.write
"Regex source folder '{source_folder_path}' does not exists.\n"
# Only sub-directory names are used as regex sources
198 for dir
in source_folder
.files
do if dir
.stat
.is_dir
then
199 folder_names
.add dir
.to_s
203 # Compile our inferred patterns
204 for folder_name
in folder_names
do
205 # Transform from "Some folder name" to "^Some.folder.name"
# Spaces become single-char wildcards; regex metacharacters [, (, + and * are escaped
206 var regex_source
= folder_name
.
207 replace
(' ', ".").replace
('[', "\\[").replace
('(', "\\(").
208 replace
('+', "\\+").replace
('*', "\\*")
# Anchor at the start of the title
209 regex_source
= "^" + regex_source
211 var regex
= regex_source
.to_re
# Case-insensitive matching of titles
212 regex
.ignore_case
= true
# NOTE(review): `patterns.add regex` (original line ~213) is missing from this chunk — confirm
217 if patterns
.is_empty
then
218 sys
.stderr
.write
"Do not have any pattern to work with.\n"
223 print
"# Generated {patterns.length} patterns"
224 print patterns
.join
("\n")
232 # Get the content of the RSS feed at `self`
233 fun fetch_rss_content
: Text
237 if sys
.verbose
then print
"\n# Downloading RSS file from '{self}'"
# Fetch the feed at `self` with a curl request
239 var request
= new CurlHTTPRequest(to_s
, curl
)
240 var response
= request
.execute
# On success the body is the RSS document
# NOTE(review): the `return body` path (original lines 244+) is missing from this chunk — confirm
242 if response
isa CurlResponseSuccess then
243 var body
= response
.body_str
245 if sys
.verbose
then print
"Download successful"
# On failure, report the error on stderr
247 else if response
isa CurlResponseFailed then
248 sys
.stderr
.write
"Failed downloading URL '{self}' with: {response.error_msg} ({response.error_code})\n"
255 # Get this RSS feed content as an `Array[Element]`
256 fun to_rss_elements
: Array[Element]
# Extract titles and links with ad-hoc regexes rather than a full XML parser
258 var title_re
= "<title><![^/]*</title>".to_re
259 var link_re
= "<link>[^<]*download[^<]*</link>".to_re
# Lengths of the CDATA wrapper around titles, used to strip it below
261 var title_prefix_len
= "<title><![CDATA[".length
# NOTE(review): the +1 presumably accounts for the closing `>` of `]]>` — confirm
262 var title_suffix_len
= "]]</title>".length
+1
264 var titles
= search_all
(title_re
)
265 var links
= search_all
(link_re
)
267 if sys
.verbose
then print
"\n# Found {titles.length} titles and {links.length} links"
# Titles and links are expected to come in matched pairs
268 assert titles
.length
== links
.length
270 var elements
= new Array[Element]
271 for i
in titles
.length
.times
do
# Strip the CDATA prefix and suffix around each title
272 var title
= titles
[i
].to_s
.substring
(title_prefix_len
, titles
[i
].length
- title_prefix_len
- title_suffix_len
)
# 6 and 7 are the lengths of "<link>" and "</link>"
273 var link
= links
[i
].to_s
.substring
(6, links
[i
].length
- 6 - 7)
274 elements
.add
new Element(title
, link
)
278 print
"# Found elements:"
279 print elements
.join
("\n")
# Implement this method in your module to configure this tool
#
# Returns `null` by default: the program then reports the missing
# configuration and aborts.
fun tool_config: nullable Config
do
	return null
end
# No configuration: point the user to the sample and stop
291 print
"This tool is not configured, take a look at the example `sample_config.nit`"
293 abort # For the flow only
# NOTE(review): `c` comes from `tool_config` on a line missing from this chunk — confirm
296 var tool
= new Downloader(c
)