#!/usr/bin/ruby
# Fetches mailman raw archives from a base URI.
#
# The table-of-contents page is scraped for "[ Gzip'd Text ... ]" links and
# every linked archive is saved, still gzip-compressed, under 'data/'.
#
# Downloads are done in pure Ruby (open-uri) rather than by shelling out to
# curl, so text scraped from the index page is never handed to a shell.
#
# Dan Brickley

toc = 'http://lists.burri.to/pipermail/geowanking/'

require 'open-uri'
require 'pp'
require 'zlib' # not used below; kept from the original script

# Directory the archives are written into (created on demand by fetch).
DATA_DIR = 'data'

# Matches the href of a gzip'd archive link on the index page, e.g.
#   <a href="2005-January.txt.gz">[ Gzip'd Text 15 KB ]</a>
GZIP_LINK = %r{href="([^"]+)">\[ Gzip}.freeze

gzips = []

# Downloads one archive into DATA_DIR, named after the last path segment of
# the URL (the same naming rule `curl -O` applied).
#
# @param uri [String] absolute http(s) URL of the archive
# @param fn [String] label shown in the progress message only
# @return [Integer] number of bytes written
# @raise [ArgumentError] if uri is not http(s) or yields no usable file name
# @raise [URI::InvalidURIError] if uri cannot be parsed
# @raise [OpenURI::HTTPError] if the server answers with an error status
def fetch(uri, fn = '')
  puts "Archiving: #{uri} as #{fn} "

  parsed = URI.parse(uri)
  # Calling #open on a parsed HTTP URI (instead of URI.open on a raw string)
  # rules out any fallback to Kernel#open, which can read local files or pipes.
  raise ArgumentError, "not an http(s) URI: #{uri}" unless parsed.is_a?(URI::HTTP)

  # basename drops any directory components, so a crafted href cannot make
  # the download land outside DATA_DIR.
  name = File.basename(parsed.path.to_s)
  if name.empty? || ['.', '..', '/'].include?(name)
    raise ArgumentError, "cannot derive a file name from #{uri}"
  end

  Dir.mkdir(DATA_DIR) unless Dir.exist?(DATA_DIR)

  # Binary mode on both ends: the payload is gzip data and is stored as-is.
  parsed.open('rb') do |remote|
    File.open(File.join(DATA_DIR, name), 'wb') do |local|
      IO.copy_stream(remote, local)
    end
  end
end

# Read the index page, report its metadata, and collect the gzip archive links.
URI.open(toc) do |f|
  pp f.meta
  pp "Content-Type: " + f.content_type
  pp "last modified" + f.last_modified.to_s

  f.each_line do |line|
    # We want the gzipped archives:  [ Gzip'd Text 15 KB ]
    found = GZIP_LINK.match(line)
    next unless found

    puts "GZIP! #{found[1]} "
    gzips.push(found[1])
  end
end

gzips.each do |rel|
  puts "Gzip URI: #{rel}"
  begin
    fetch(toc + rel, rel)
  rescue StandardError => e
    # Best effort: one broken or unreachable archive must not stop the rest.
    warn "Failed to fetch #{rel}: #{e.class}: #{e.message}"
  end
end