From: Jean Privat
Date: Mon, 4 Jul 2016 15:25:02 +0000 (-0400)
Subject: nitwebcrawl: add a simple crawler for nitweb
X-Git-Url: http://nitlanguage.org

nitwebcrawl: add a simple crawler for nitweb

Signed-off-by: Jean Privat
---

diff --git a/src/examples/nitwebcrawl.nit b/src/examples/nitwebcrawl.nit
new file mode 100644
index 0000000..1187476
--- /dev/null
+++ b/src/examples/nitwebcrawl.nit
@@ -0,0 +1,90 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Crawler on the nitweb web API
+module nitwebcrawl
+
+import json
+
+# Download a HTTP resource.
+# Runs `curl -s` on `url` and returns all the text read from that process.
+fun curl(url: String): String do
+	# TODO: use curl.nit
+	var process = new ProcessReader("curl", "-s", url)
+	var body = process.read_all
+	process.close
+	process.wait
+	# TODO: process HTTP error codes
+	# A non-zero exit status is only reported; the text read so far is still returned.
+	if process.status != 0 then print "Error with {url}"
+	return body
+end
+
+# Recursively collect all string values in a json value associated to a given key.
+fun search_json(json: nullable Jsonable, key: String, result: nullable Array[String]): Array[String]
+do
+	# `result` is the shared accumulator: created by the outermost call and passed down the recursion.
+	if result == null then result = new Array[String]
+	if json isa JsonObject then
+		# Recurse into every value first, then record this object's own `key` entry (must be a String).
+		for k, v in json do search_json(v, key, result)
+		var v = json.get_or_null(key)
+		if v != null then
+			assert v isa String
+			result.add v
+		end
+	else if json isa JsonArray then
+		for e in json do search_json(e, key, result)
+	else if json == null or json isa String or json isa Int or json isa Bool or json isa Float then
+		# nop
+	else
+		print json.class_name
+		abort
+	end
+	return result
+end
+
+# Usage: nitwebcrawl [SERVER [PATH...]]
+var server = "http://localhost:3000"
+var todo = ["/api/entity/core"]
+if args.not_empty then
+	# SERVER is the first argument, hence `shift` (not `pop`); the remaining ones are start paths.
+	server = args.shift
+	if args.not_empty then
+		todo.clear
+		todo.add_all args
+	end
+end
+
+# Paths already queued, so that each one is fetched only once.
+var seen = new Set[String]
+seen.add_all todo
+
+# Crawl loop: `todo` is popped from its end, like a stack.
+var cpt = 0
+while todo.not_empty do
+	cpt += 1
+	var url = server + todo.pop
+	print "process {url}. {cpt}+{todo.length}/{seen.length}"
+	var txt = curl(url).parse_json
+	if txt isa Error then
+		print "{url}: {txt.message}"
+		continue
+	end
+	for x in search_json(txt, "api_url") do
+		if seen.has(x) then continue
+		todo.add x
+		seen.add x
+	end
+end
diff --git a/tests/sav/nitwebcrawl.res b/tests/sav/nitwebcrawl.res
new file mode 100644
index 0000000..caa407f
--- /dev/null
+++ b/tests/sav/nitwebcrawl.res
@@ -0,0 +1,3 @@
+process http://localhost:3000/api/entity/core. 1+0/1
+Error with http://localhost:3000/api/entity/core
+http://localhost:3000/api/entity/core: Unexpected Eof; is acceptable instead: value