class Hawler
Attributes
brute[RW]
debug[RW]
depth[RW]
force[RW]
headers[RW]
help[RW]
peek[RW]
proxy[RW]
proxyport[RW]
recurse[RW]
sleep[RW]
types[RW]
verbose[RW]
Public Class Methods
new(uri, block)
click to toggle source
# File lib/hawler.rb, line 69
# Build a crawler rooted at +uri+.
#
# uri   - starting location as a String; "http://" is prepended when the
#         string does not already start with an http:// or https:// scheme.
# block - callable stored in @block; hawl invokes it with
#         (uri, referer, response) for every URI it analyzes.
#
# Side effect: installs process-wide handlers for INT, USR1 and USR2.
def initialize(uri, block)
  # \A anchors at the very start of the string; ^ would also match at the
  # start of any later line inside the string.
  uri = "http://#{uri}" unless uri =~ %r{\Ahttps?://}

  @uri = uri
  @block = block
  @links = {}
  @recurse = false
  @verbose = false
  @debug = false
  @depth = nil
  @sleep = 0
  @done = false
  @force = false
  @brute = false
  @peek = false
  # Content types that hawl accepts for retrieval when peeking is enabled.
  @types = { "text/html" => 1, "text/xml" => 1, "application/xml" => 1 }
  @headers = {}
  @proxy = nil
  @proxyport = nil

  # register some signal handlers. halt on ctrl-c, enable verbose on SIGUSR1
  # and enable debug on SIGUSR2
  #
  # The handlers are blocks, not zero-arity lambdas: Ruby calls a trap
  # handler with the signal number, and a lambda that declares no
  # parameters rejects that call with ArgumentError.
  Signal.trap("INT") do
    @done = true
    puts "Terminating -- ctrl-c"
  end
  Signal.trap("USR1") do
    @verbose = !@verbose
    # The message is only printed when the toggle switches the mode on.
    puts "Enabling verbose mode" if @verbose
  end
  Signal.trap("USR2") do
    @debug = !@debug
    puts "Enabling debug mode" if @debug
  end
end
Public Instance Methods
generate_hawlee(link, hawlee)
click to toggle source
Simple helper to create a new Hawlee
# File lib/hawler.rb, line 64
# Create the Hawlee for +link+ as discovered from +hawlee+.
#
# link   - the newly found location.
# hawlee - the Hawlee on which +link+ was found; its uri and depth + 1 are
#          passed as the second and third arguments of the new Hawlee.
#
# Returns the new Hawlee.
def generate_hawlee(link, hawlee)
  print_debug("Queuing #{link} for processing")
  found_on = hawlee.uri
  next_depth = hawlee.depth + 1
  Hawlee.new(link, found_on, next_depth)
end
start()
click to toggle source
Start the Hawler.
# File lib/hawler.rb, line 100
# Entry point: validate the starting URI and begin crawling from it.
#
# When recursion is disabled (@recurse is falsy) the depth limit is set
# to 0. The process exits with status 1 if HawlerHelper.valid_uri returns
# a falsy value for the starting URI.
def start
  @depth = 0 unless @recurse

  checked = HawlerHelper.valid_uri(@uri)
  @uri = checked
  exit(1) unless checked

  hawl(@uri)
end
Private Instance Methods
do_once(uri, referer, what, block)
click to toggle source
For every URI, do something called what, which consists of executing block. Each what is performed at most once per URI.
# File lib/hawler.rb, line 112
# Run +block+ for +uri+ unless the step named +what+ was already recorded
# for that URI.
#
# uri     - key into @links; a Hawlee is created for it on first use.
# referer - referer stored in a newly created Hawlee and used in log output.
# what    - step name; "#{what}?" is sent to the Hawlee to ask whether the
#           step was already done, and "#{what}" is sent to record it.
# block   - callable to execute when the step has not been done yet.
#
# Returns the value of block.call, or false when the step is skipped.
def do_once(uri, referer, what, block)
  @links[uri] ||= Hawlee.new(uri, referer, 0)
  hawlee = @links[uri]

  if hawlee.send("#{what}?")
    print_debug("Skipping #{uri} (referer #{referer}) -- '#{what}' already called")
    # Return an explicit falsy value. Callers such as
    # `do_once(...) or next` must not treat a skipped step as success
    # merely because the logging helper happened to return something truthy.
    return false
  end

  print_verbose("Calling #{what} on #{uri} (referer #{referer})")
  hawlee.send("#{what}")
  block.call
end
hawl(uri)
click to toggle source
# File lib/hawler.rb, line 126
# Crawl starting at +uri+, working through a first-in, first-out queue of
# Hawlee entries until the queue is empty or @done is set.
#
# For each entry the steps are, in order: an optional HEAD "peek" (when
# @peek is set), a GET, link harvesting on a successful response, and a
# call to the user block (@block). Each step goes through do_once.
#
# uri - the starting URI; it is also the reference point for the offsite
#       check and for merging redirect locations. It must respond to
#       #merge (presumably a URI object -- confirm against
#       HawlerHelper.valid_uri).
def hawl(uri)
  # sucks to have to use an array for this, but
  # order is important to achieve something that is close
  # to a breadth-first search
  links_to_process = []
  links_to_process << Hawlee.new(uri, nil, 0)
  while (!links_to_process.empty?)
    cur_hawlee = links_to_process.shift
    # Offsite entries are dropped unless @force is set.
    if (HawlerHelper.offsite?(uri, cur_hawlee.uri))
      unless (@force)
        print_debug("Skipping offsite URI #{cur_hawlee}")
        next
      end
    end
    if (@peek)
      # The lambda's `return` statements leave only the lambda. A falsy
      # result from do_once moves on to the next queued entry (`or next`).
      do_once(cur_hawlee.uri, cur_hawlee.referer, :head, lambda {
        if (@depth && cur_hawlee.depth > @depth)
          print_debug("Max recursion depth of #{@depth} at #{cur_hawlee.uri}")
          return false
        end
        peek_response = HawlerHelper.head(cur_hawlee.uri, cur_hawlee.referer, @headers, @proxy, @proxyport)
        if (peek_response.nil?)
          return false
        else
          case peek_response
          when Net::HTTPRedirection
            if (HawlerHelper.valid_uri(peek_response['location']))
              # NOTE(review): the location is merged against the starting
              # uri, not cur_hawlee.uri -- confirm this is intended for
              # relative redirects.
              redirect = uri.merge(peek_response['location'])
              links_to_process << generate_hawlee(redirect, cur_hawlee)
              return false
            end
          end
          # only pass this URI on for retrieval if it's
          # Content-Type is one that is likely to have links in it.
          if (peek_response.key?("Content-Type"))
            c = peek_response["Content-Type"]
            # Strip everything from the first ";" on (e.g. a charset
            # parameter). gsub! modifies the header string in place.
            c.gsub!(/;.*/, "")
            if (@types["#{c}"])
              return true
            else
              return false
            end
          else
            return true
          end
        end
      }) or next
    end
    # Assigned inside the GET lambda below; stays nil when the GET step is
    # skipped, the depth limit is exceeded, or HawlerHelper.get returns nil.
    response = nil
    do_once(cur_hawlee.uri, cur_hawlee.referer, :get, lambda {
      if (@depth && cur_hawlee.depth > @depth)
        print_debug("Max recursion depth of #{@depth} at #{cur_hawlee.uri}")
      else
        response = HawlerHelper.get(cur_hawlee.uri, cur_hawlee.referer, @headers, @proxy, @proxyport)
        unless (response.nil?)
          case response
          when Net::HTTPRedirection
            if (HawlerHelper.valid_uri(response['location']))
              # NOTE(review): merged against the starting uri here as well.
              redirect = uri.merge(response['location'])
              links_to_process << generate_hawlee(redirect, cur_hawlee)
            end
          end
        end
      end
    })
    unless (response.nil?)
      case response
      when Net::HTTPRedirection
        # Nothing to harvest; the redirect target was queued above.
      when Net::HTTPSuccess
        do_once(cur_hawlee.uri, cur_hawlee.referer, :harvest, lambda {
          HawlerHelper.harvest(cur_hawlee.uri, response.body).each do |l|
            links_to_process << generate_hawlee(l, cur_hawlee)
          end
          if (@brute)
            HawlerHelper.brute_from_uri(cur_hawlee.uri).each do |b|
              links_to_process << generate_hawlee(b, cur_hawlee)
            end
            # NOTE(review): unlike brute_from_uri, the block is passed
            # directly to brute_from_data (no .each) -- confirm that the
            # helper yields each candidate.
            HawlerHelper.brute_from_data(cur_hawlee.uri, response.body) do |b|
              links_to_process << generate_hawlee(b, cur_hawlee)
            end
          end
        })
      end
    end
    # The user block is called even when response is nil.
    do_once(cur_hawlee.uri, cur_hawlee.referer, :analyze, lambda {
      @block.call(cur_hawlee.uri, cur_hawlee.referer, response)
    } )
    break if (@done)
    Kernel.sleep(@sleep) if (@sleep)
  end
end
print_debug(msg)
click to toggle source
Print debug messages if so desired
# File lib/hawler.rb, line 229
# Print +msg+ when debug output is enabled (@debug), then flush the
# output stream.
#
# The write and the flush both go through $stdout so that they always
# target the same stream, even when $stdout has been reassigned.
#
# Returns the result of the flush call.
def print_debug(msg)
  $stdout.puts(msg) if @debug
  $stdout.flush
end
print_verbose(msg)
click to toggle source
Print verbose messages if so desired
# File lib/hawler.rb, line 235
# Print +msg+ when verbose output is enabled (@verbose), then flush the
# output stream.
#
# The write and the flush both go through $stdout so that they always
# target the same stream, even when $stdout has been reassigned.
#
# Returns the result of the flush call.
def print_verbose(msg)
  $stdout.puts(msg) if @verbose
  $stdout.flush
end