contrib: intro rss_downloader

author Alexis Laferrière <alexis.laf@xymus.net>

Mon, 22 Sep 2014 17:50:30 +0000 (13:50 -0400)

committer Alexis Laferrière <alexis.laf@xymus.net>

Sun, 14 Dec 2014 21:08:38 +0000 (16:08 -0500)
author Alexis Laferrière <alexis.laf@xymus.net>
Mon, 22 Sep 2014 17:50:30 +0000 (13:50 -0400)
committer Alexis Laferrière <alexis.laf@xymus.net>
Sun, 14 Dec 2014 21:08:38 +0000 (16:08 -0500)
diff --git a/contrib/rss_downloader/Makefile b/contrib/rss_downloader/Makefile

new file mode 100644 (file)

index 0000000..3c71d2e
--- /dev/null
+++ b/contrib/rss_downloader/Makefile
@@ -0,0 +1,5 @@
+# `all` names a build action, not a file: declare it phony so it always runs
+.PHONY: all
+all:
+       mkdir -p bin/
+       ../../bin/nitc --dir bin/ src/*.nit
diff --git a/contrib/rss_downloader/README.md b/contrib/rss_downloader/README.md

new file mode 100644 (file)

index 0000000..eddc63b
--- /dev/null
+++ b/contrib/rss_downloader/README.md
@@ -0,0 +1,15 @@
+This tool downloads the files pointed to by RSS feeds.
+
+It fetches the content of the RSS feed, then filters it according to custom patterns and to existing folders on disk. Selected elements will be downloaded to the given folder.
+
+This tool must be compiled with its configuration in order to work. See the `sample_config.nit` in `src/`.
+
+# Features and TODO
+
+- [x] Download & parse RSS file
+- [x] Generate regular expressions from local folders
+- [x] Auto download files matching regular expressions
+- [x] Memory to prevent double downloads
+- [ ] Blacklist
+- [ ] Use a cleaner RSS parser
+- [ ] Tolerate more feed formats
diff --git a/contrib/rss_downloader/src/rss_downloader.nit b/contrib/rss_downloader/src/rss_downloader.nit

new file mode 100644 (file)

index 0000000..eacaf79
--- /dev/null
+++ b/contrib/rss_downloader/src/rss_downloader.nit
@@ -0,0 +1,297 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Downloads files from RSS feeds
+module rss_downloader
+
+import curl
+
+redef class Sys
+       # Was verbose output requested on the command line? (`-v` or `--verbose`)
+       var verbose: Bool = args.has("--verbose") or args.has("-v") is lazy
+end
+
+# Program configuration
+#
+# The tool gets its instance from `tool_config`, which must be implemented
+# by the user's own module.
+class Config
+
+       # Folders whose sub-folder names are turned into title patterns (may be empty)
+       var regex_source_folders: Array[Path]
+
+       # Custom patterns matched against the title of each element (may be empty)
+       var custom_patterns: Array[Pattern]
+
+       # Download destination, each element is saved in it under its title
+       var download_destination_folder: Path
+
+       # URLs of the RSS feeds to fetch (needs at least one)
+       var rss_source_urls: Array[Text]
+
+       # Path to the log file
+       #
+       # It holds one unique ID per line, it is read at the beginning of a run
+       # and rewritten at the end.
+       var log_path: Path
+
+       # Patterns used to prevent double downloads (may be empty)
+       #
+       # The first pattern found in a title delimits its unique ID:
+       # the lowercased title up to the end of the match.
+       var unique_pattern: Array[Pattern]
+
+       # Exceptions to uniqueness (may be empty)
+       #
+       # An element with a title containing one of these patterns
+       # can be downloaded again even if its unique ID is in the log.
+       var unique_exceptions: Array[Pattern]
+end
+
+# An element from an RSS feed
+class Element
+       # Title
+       var title: String
+
+       # Link to the file to download
+       var link: String
+
+       redef fun to_s do return "{title} @ {link}"
+
+       # Download this element to `path`
+       #
+       # On failure, an error is written to `stderr` and the method returns normally.
+       fun download_to(path: Text)
+       do
+               var curl = new Curl
+
+               var request = new CurlHTTPRequest(link, curl)
+               var response = request.download_to_file(path.to_s)
+
+               if response isa CurlFileResponseSuccess then
+                       curl.destroy
+               else if response isa CurlResponseFailed then
+                       sys.stderr.write "Failed downloading URL '{link}' with: {response.error_msg} ({response.error_code})\n"
+
+                       # Release the Curl handle on failure too, it used to be leaked
+                       curl.destroy
+               else abort
+       end
+
+       # Get a unique identifier for this element, uses `Config::unique_pattern`
+       #
+       # The identifier is the lowercased `title` up to the end of the first
+       # pattern found in it. If no pattern is found, the identifier is `title`
+       # as is (its case is then preserved).
+       fun unique_id(config: Config): String
+       do
+               for re in config.unique_pattern do
+                       var match = title.search(re)
+                       if match != null then
+                               return title.substring(0, match.after).to_lower
+                       end
+               end
+
+               return title
+       end
+
+       # Is this element exempt from uniqueness?
+       #
+       # True if `title` contains any pattern of `Config::unique_exceptions`.
+       fun is_unique_exception(config: Config): Bool
+       do
+               for re in config.unique_exceptions do
+                       if title.has(re) then
+                               return true
+                       end
+               end
+               return false
+       end
+end
+
+# Main program structure
+class Downloader
+       # Configuration
+       var config: Config
+
+       # Local history of unique IDs (read from, then written to, the log file)
+       var history = new HashSet[Text]
+
+       # Execute tool
+       #
+       # Loads the history from the log, fetches all the feeds, downloads the
+       # matching elements that are neither on disk nor in the history,
+       # then saves the updated history to the log.
+       fun run
+       do
+               # Read old log from file
+               if config.log_path.exists then
+                       var stream = config.log_path.open_ro
+                       for line in stream.read_all.split("\n") do
+                               # Skip blank lines, such as the one following the last "\n":
+                               # they are not IDs and would be written back to the log
+                               if not line.is_empty then history.add line
+                       end
+                       stream.close
+               end
+
+               # Get the pattern to search for
+               var patterns = self.patterns
+
+               # Get all the elements from the RSS feeds
+               var elements = new HashSet[Element]
+               for rss_url in config.rss_source_urls do
+                       var rss = rss_url.fetch_rss_content
+                       elements.add_all rss.to_rss_elements
+               end
+
+               # Select the elements matching our pattern
+               var matches = new HashSet[Element]
+               for pattern in patterns do for element in elements do
+                       if element.title.has(pattern) then
+                               matches.add element
+                       end
+               end
+
+               if sys.verbose then
+                       print "\n# {matches.length} matching elements:"
+                       print matches.join("\n")
+                       print "\n# Downloading..."
+               end
+
+               for element in matches do
+                       # The title comes from the feed: neutralize the path separators
+                       # so the file cannot be written outside of the destination folder
+                       var file_name = element.title.replace('/', "_")
+                       var local_path = config.download_destination_folder.to_s / file_name
+                       var unique_id = element.unique_id(config)
+
+                       if local_path.to_path.exists then
+                               # Do not redownload a file (we assume that the file name is unique by itself)
+                               if sys.verbose then print "File exists, skipping {element}"
+                               continue
+                       else if history.has(unique_id) then
+                               # Do not download a file that is not unique according to `unique_id`
+                               if not element.is_unique_exception(config) then
+                                       # We make some exceptions
+                                       if sys.verbose then print "File in log, skipping {element}"
+                                       continue
+                               end
+                       end
+
+                       # Download element
+                       if sys.verbose then print "Fetching {element} as {local_path}"
+                       element.download_to(local_path)
+
+                       # Add `unique_id` to log
+                       #
+                       # NOTE(review): `download_to` does not report failures, so a failed
+                       # download is recorded too and is not retried -- confirm this is intended.
+                       history.add unique_id
+               end
+
+               # Save new log to file
+               var stream = config.log_path.open_wo
+               for line in history do
+                       stream.write line
+                       stream.write "\n"
+               end
+               stream.close
+       end
+
+       # Gather all patterns from `Config::custom_patterns` and `Config::regex_source_folders`
+       #
+       # Exits the program with an error if there is no pattern at all.
+       fun patterns: Array[Pattern]
+       do
+               var patterns = new Array[Pattern]
+
+               # Begin with custom pattern
+               patterns.add_all config.custom_patterns
+
+               # Get regex source from folder names
+               var folder_names = new HashSet[Text]
+               for source_folder in config.regex_source_folders do
+                       if not source_folder.exists then
+                               sys.stderr.write "Regex source folder '{source_folder}' does not exists.\n"
+                               continue
+                       end
+
+                       # NOTE(review): this assumes `to_s` gives the bare folder name,
+                       # without the path of `source_folder` -- confirm against `Path::files`.
+                       for dir in source_folder.files do if dir.stat.is_dir then
+                               folder_names.add dir.to_s
+                       end
+               end
+
+               # Compile our inferred patterns
+               for folder_name in folder_names do
+                       # Transform from "Some folder (name)" to "^Some.folder.\(name\)"
+                       #
+                       # Backslashes are escaped first so the escapes added afterwards are
+                       # left intact. Dots are not escaped: as before, they match any character.
+                       var regex_source = folder_name.
+                               replace('\\', "\\\\").
+                               replace(' ', ".").replace('[', "\\[").
+                               replace('(', "\\(").replace(')', "\\)").
+                               replace('+', "\\+").replace('*', "\\*").
+                               replace('?', "\\?").replace('|', "\\|").
+                               replace('^', "\\^").replace('$', "\\$")
+                       regex_source = "^" + regex_source
+
+                       var regex = regex_source.to_re
+                       regex.ignore_case = true
+
+                       patterns.add regex
+               end
+
+               if patterns.is_empty then
+                       sys.stderr.write "Do not have any pattern to work with.\n"
+                       exit 1
+               end
+
+               if sys.verbose then
+                       print "# Generated {patterns.length} patterns"
+                       print patterns.join("\n")
+               end
+
+               return patterns
+       end
+end
+
+redef class Text
+       # Fetch the RSS feed at the URL `self` and return its content
+       #
+       # Exits the program with an error if the download fails.
+       fun fetch_rss_content: Text
+       do
+               var handle = new Curl
+
+               if sys.verbose then print "\n# Downloading RSS file from '{self}'"
+
+               var http_request = new CurlHTTPRequest(to_s, handle)
+               var result = http_request.execute
+
+               if result isa CurlResponseSuccess then
+                       var content = result.body_str
+                       handle.destroy
+                       if sys.verbose then print "Download successful"
+                       return content
+               end
+
+               if result isa CurlResponseFailed then
+                       sys.stderr.write "Failed downloading URL '{self}' with: {result.error_msg} ({result.error_code})\n"
+                       exit 1
+               end
+
+               abort
+       end
+
+       # Parse `self` as the content of an RSS feed and return its `Element`s
+       #
+       # Titles and links are paired by their order of appearance,
+       # there must be as many of each.
+       fun to_rss_elements: Array[Element]
+       do
+               var cdata_title_re = "<title><![^/]*</title>".to_re
+               var download_link_re = "<link>[^<]*download[^<]*</link>".to_re
+
+               # Lengths of the markup around the text of each match
+               var title_head = "<title><![CDATA[".length
+               # The `+1` presumably stands for the `>` closing the CDATA section
+               var title_tail = "]]</title>".length+1
+               # Lengths of "<link>" and "</link>"
+               var link_head = 6
+               var link_tail = 7
+
+               var found_titles = search_all(cdata_title_re)
+               var found_links = search_all(download_link_re)
+
+               if sys.verbose then print "\n# Found {found_titles.length} titles and {found_links.length} links"
+               assert found_titles.length == found_links.length
+
+               var elements = new Array[Element]
+               for i in [0..found_titles.length[ do
+                       var raw_title = found_titles[i]
+                       var raw_link = found_links[i]
+
+                       # Strip the surrounding markup
+                       var title = raw_title.to_s.substring(title_head, raw_title.length - title_head - title_tail)
+                       var link = raw_link.to_s.substring(link_head, raw_link.length - link_head - link_tail)
+
+                       elements.add new Element(title, link)
+               end
+
+               if sys.verbose then
+                       print "# Found elements:"
+                       print elements.join("\n")
+               end
+
+               return elements
+       end
+end
+
+# Configuration of this tool, implement this method in your own module
+#
+# This default implementation returns `null`: the tool is not configured.
+fun tool_config: nullable Config
+do
+       return null
+end
+
+var config = tool_config
+if config == null then
+       print "This tool is not configured, take a look at the example `sample_config.nit`"
+       exit 1
+       abort # For the flow only
+end
+
+var downloader = new Downloader(config)
+downloader.run
author	Alexis Laferrière <alexis.laf@xymus.net>
	Mon, 22 Sep 2014 17:50:30 +0000 (13:50 -0400)
committer	Alexis Laferrière <alexis.laf@xymus.net>
	Sun, 14 Dec 2014 21:08:38 +0000 (16:08 -0500)
contrib/rss_downloader/Makefile	[new file with mode: 0644]	patch \| blob
contrib/rss_downloader/README.md	[new file with mode: 0644]	patch \| blob
contrib/rss_downloader/src/rss_downloader.nit	[new file with mode: 0644]	patch \| blob