#!/usr/bin/perl -W -C ##!/usr/bin/perl -W -C -T # $Id: htmldu,v 1.2 2004/07/25 15:29:59 jan Exp $ ############################################################################### # htmldu - Determine the total size of web pages. # # Copyright (C) 2004 Jan Bretschneider # # # # This program is free software; you can redistribute it and/or # # modify it under the terms of the GNU General Public License # # as published by the Free Software Foundation; either version 2 # # of the License, or (at your option) any later version. # # # # This program is distributed in the hope that it will be useful, # # but WITHOUT ANY WARRANTY; without even the implied warranty of # # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # # GNU General Public License for more details. # # # # You should have received a copy of the GNU General Public License # # along with this program; if not, write to the Free Software # # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # ############################################################################### ######################################################################## # used packages # ######################################################################## #use HTML::LinkExtor; use HTML::Parser; use LWP::UserAgent; use URI::file; use URI::WithBase; use File::stat; use File::Spec; use File::Basename; use Getopt::Long; #use File::KGlob; use Data::Dumper; use IO::File; use strict; # An error has occured while determining the size of a file. use constant SIZE_ERROR => -1; # The size of a remote file was not determined because use configured # this program to do so. use constant SIZE_REMOTE_IGNORED => -2; ######################################################################## # global variables # ######################################################################## my $programname = basename($0); my $usage = < 0, 'print_version' => 0, 'retrieve_remote_resources' => 0 ); ######################################################################## # main program # ######################################################################## # parse commandline arguments GetOptions('h|help' => \$config{'print_help'}, 'V|version' => \$config{'print_version'}, 'r|remote-resources' => \$config{'retrieve_remote_resources'}); # print usage information and exit if user requested help die $usage if $config{'print_help'}; if ($config{'print_version'}) { print "htmldu $version\n"; print 'Author: Jan Bretschneider '; print "\n\n"; print 'htmldu comes with ABSOLUTELY NO WARRANTY. This is free software '. "and you\ncan redistribute or modify it under the terms of the ". "GNU General Public License\nas published by the Free Software ". "Foundation.\n"; exit(0); } # no files were given on command -> print usage information an exit if ($#ARGV < 0) { print $programname.': Please give me one or more files or URLs to '. "process.\n". 'Also see "'.$programname.' --help" for help'."\n"; exit(0); } # switch to unbuffered select (STDERR); $| = 1; # if only one file or URI was passed if ($#ARGV == 0) { # print detailed statistics for that file my $htmlfile = new HTMLFile($ARGV[0]); $htmlfile->collect_report_data(); $htmlfile->print_report(); } # else multiple files were passed else { # print multiple file report my $htmlfiles = new SetOfHTMLFiles(@ARGV); $htmlfiles->collect_report_data(); $htmlfiles->print_report(); } # exit exit(0); ######################################################################## # subroutines # ######################################################################## # download_remote_resource($uri, \$final_uri, \$target_file_handle) # # This routine downloads the remote resource at the given URI $uri at a # temporary location, which is determined by the routine itself. If the # download was successful 1 (true) is returned and a file handle to the # temporary file is saved in $target_file_handle whose reference was # given to this routine. Also the final URI (after all redirects) is # saved in $final_uri. If the download failed, 0 (false) is returned # and the value of $target_path and $final_uri are undefined. sub download_remote_resource { my $uri = shift; my $final_uri = shift; my $target_file_handle = shift; # open a temporary file for saving the remote resource my $tmp_file = IO::File->new_tmpfile; if (!$tmp_file) { print STDERR "$programname: Unable to create temporary file: $!\n"; return 0; } # download the file my $ua = LWP::UserAgent->new; $ua->agent($programname." ".$version); $ua->timeout(15); my $response = $ua->get($uri); # if download was successful if ($response->is_success) { # save downloaded file to the temporay location print $tmp_file $response->content; seek($tmp_file, 0, SEEK_SET); # save temporary location for caller $$target_file_handle = $tmp_file; # return final URL (after redirects) to caller $$final_uri = $response->request->uri; # return success return 1; } # else download failed else { # print error message print STDERR $programname.': Unable to download '.$uri. ': '.$response->message."\n"; # return failure return 0; } } # is_http_uri($uri_string) # # This routine determines whether the given string looks like a URI for # a remote resource. (i.e. it begins with http:// or https://) sub is_http_uri { my $uri_string = shift; return $uri_string =~ m/^https?:\/\//i; } # normalized_uri_string($uri1, $uri2) # # This method gets passed a normalized URI (as a string) in $uri1 and an # URI (as string, not necessarily normalized) in $uri2. $uri2 is # absolute or relative to $uri1. Then this method returns the normalized # form of $uri2 (as a string). sub normalized_uri_string { my $uri1 = shift; my $uri2 = shift; my $u1; my $u2; # if $uri2 starts with "http" if (is_http_uri($uri2)) { # $uri2 is normalized, so return it return $uri2; } # $uri2 does not start with "http" # if $uri1 starts with "http" (clearly denotes remote resource) if (is_http_uri($uri1)) { # $uri2 denotes remote resource, too # create and return an absolute URI from relative $uri2 and base $uri1 $u2 = URI::WithBase->new($uri2, $uri1); return $u2->abs->as_string; } # neither $uri1 nor $uri2 starts with "http" -> they denote local # resources my $normalized_uri_string; # if $uri1 is absolute if (File::Spec->file_name_is_absolute($uri1)) { # construct absolute URI from absolute $uri1 and relative or absolute # $uri2 $u1 = URI::file->new($uri1); $u2 = URI::file->new($uri2); $normalized_uri_string = $u2->abs($u1)->file; } # else $uri1 is relative (to the current working directory) else { # create an URI that is relative to the current working # directory from the relative $uri1 and from $uri2 that is # relative to $uri1 $u1 = URI::file->new($uri1); $u2 = URI::file->new($uri2); my $u2abs = $u2->abs($u1->abs(URI::file->cwd)); $normalized_uri_string = $u2abs->rel(URI::file->cwd)->file; } return pseudo_content_negotiation($normalized_uri_string); } # pseudo_content_negotiation($path) # # Does pseudo content negotiation (partly emulating the Apache Option # MultiViews). It takes a string that contains the path to a file for # which content negotiation should be performed. If $path points to # a directory, this directory is searched for files with names like # index.*. Common filename extensions are tried. If such a file is found # its path is returned. If $path points to a existing file, its # path is returned (without change). Else all possible extensions are # tried. If a matching file is found, its path is returned. In any case, # if more than one possibility is found, the file with the largest size # is returned. (Remember: This tool is for keeping HTML pages small, so # returning the largest file is probably better than anything else.) # If no matching file could be found, the given path is returned. So # callers of this function should be prepared to handle file not found # errors while dealing with the return path. sub pseudo_content_negotiation { my $path = shift; # if given path exists and is a file if (-f $path) { # return unmodified path to this file return $path; } my @files; # if given path exists and is a directory if (-d $path) { my $dir = $path; # if given directory path has no trailing slash if ($dir !~ m/\/$/) { # append trailing slash $dir .= '/'; } # create set of all files matching index.* in this directory @files = glob_files($dir.'index.*'); } # else given path does not exist else { # create set of files matching given path plus '.*' (try all file # name extensions on given path) @files = glob_files($path.'.*'); } # if no matching files were found if ($#files < 0) { # return the original given path (other subroutines will # probably fail opening the given path or determining its size, # but they should handle this) return $path; } # if created set of possible files contains only one file if ($#files == 0) { # return this one file return $files[0]; } # assert: set of possible files contains more than one file # return path to file with largest size return file_with_largest_size(@files); } # glob_files($pattern) # # Does the same as the builtin function glob(), only returns files # (regular files or symbolic links to regular files). # As a reminder of what glob() does: In list context, returns a (possibly # empty) list of filename expansions on the value of $pattern such as the # standard Unix shell /bin/csh would do. sub glob_files { my $pattern = shift; my @glob_files; # do file name expansion with given pattern my @glob_all = glob($pattern); foreach my $path (@glob_all) { # search for files (not directories etc.) in all possible file # name expansions and save those in an extra list. if (-f $path) { push @glob_files, $path; } } # return all files (not directories etc.) found by the expansion return @glob_files; } # file_with_largest_size(@files) # # Returns the largest file from a given list of files. Files are given # by their path. The path to the largest file is returned. The paths may # denote local or remote files. Paths to remote files must be valid HTTP # URIs. The size of remote files is only determined if the user has # configured this program to do so (via commandline options). If no # files are given, undef is returned. sub file_with_largest_size { my @files = @_; # if given set of files is empty return undef return undef if ($#files < 0); # get the size for each given file my %file2size = get_file_sizes(@files); # sort by size my @files_sorted = sort { $file2size{$b} <=> $file2size{$a} } (keys %file2size); # return the largest file. return $files_sorted[0]; } # get_file_sizes(@files) # # Determines the size of all given files. Files are given by their # paths. The paths may denote local or remote files. Paths to remote # files must be valid HTTP URIs. The size of remote files is only # determined if the user has configured this program to do so (via # commandline options). Sizes are returned in bytes. They are returned # in a hash with the given files (paths) as keys and the sizes as # values. If no files are given, an empty hash is returned. As file # sizes these special values can be returned: # # SIZE_ERROR if size could not determined due to an error # SIZE_REMOTE_IGNORED if file is remote file that should be ignored sub get_file_sizes { my @files = @_; my %file2size = (); # for each given file foreach my $file (@files) { # if file is remote if (is_http_uri($file)) { # use function for remote files to determine size and save result $file2size{$file} = file_size_remote($file); } # else file is local else { # use function for local files to determine size and save result $file2size{$file} = file_size_local($file); } } # return all determined file sizes return %file2size; } # file_size_local($file) # # This methode determines and returns the size in bytes of the given # local file. If the size could not be determined SIZE_ERROR is # returned. sub file_size_local { my $file = shift; # stat the file my $file_stat = stat($file); # if an error occured. if (!$file_stat) { # print error message and return error code print STDERR "$programname: Unable to determine size of '$file': $!\n"; return SIZE_ERROR; } # assert: no error occured # return size of file in bytes. return $file_stat->size; } # file_size_remote($file) # # This method determines and returns the size in bytes of the given # remote file. The file must be given by a valid HTTP URI. If the user # configured this program not to determine the size of remote files, # then SIZE_REMOTE_IGNORED is returned. If an error occured SIZE_ERROR # is returned and an error message is printed to stderr. sub file_size_remote { my $file = shift; # if remote files should be ignored if (!$config{'retrieve_remote_resources'}) { print STDERR $programname.': Ignoring size of <'.$file.">. Use ". "-r option to determine size of remote resources.\n"; # return special value for this case return SIZE_REMOTE_IGNORED; } # assert: size of remote files should be determined # determine size of given file by issueing an HEAD request my $ua = LWP::UserAgent->new; $ua->agent($programname." ".$version); my $req = HTTP::Request->new(HEAD => $file); my $res = $ua->request($req); # if the request was successful if (($res->is_success) || ($res->is_redirect)) { # if server sent the the size of file if (exists $res->headers->{'content-length'}) { # return it return $res->headers->{'content-length'}; } # else server sent no file size else { # print error message print STDERR $programname.": Unable to determine size of <". $file.">. Server sent no content-length header.\n"; # return error code return SIZE_ERROR; } } # else request was not successful else { # print error message print STDERR $programname.": Unable to determine size of <". $file."> ".$res->status_line."\n"; # return error code return SIZE_ERROR; } } # print_report(%file2size) # # This function gets a hash that maps the URIs or paths of files to # sizes and prints it, one file with its size on each line, sorted by # size in descending order to stdout. The total size is printed last. sub print_report { my %file2size = @_; # sort given files by size my @files_sorted = sort { $file2size{$b} <=> $file2size{$a} } (keys %file2size); # determine the width of the size column (consider possibility of # undetermined file sizes) my $size_col_width = length($file2size{$files_sorted[0]}); my $size_total = 0; # for each file foreach my $file (@files_sorted) { # determine what to print in size column (consider undetermined # file sizes) my $size = $file2size{$file}; my $size_str = $size; # TODO don't print ignored or erroneous files for now. next if ($size == SIZE_ERROR || $size == SIZE_REMOTE_IGNORED); #if ($size == SIZE_ERROR) { $size_str = "error"; } #if ($size == SIZE_REMOTE_IGNORED) { $size_str = "ignored"; } # determine the number of blanks necessary between size and file # name, so that at the end everything is lined up my $space = ' ' x ($size_col_width - length($size_str) + 3); # print the file and its size (with necessary space; consider # undetermined file sized) print $size_str.$space.$file."\n"; # add file size to total size, if size is valid if ($size > 0) { $size_total += $size; } } # print total size my $space = ' ' x ($size_col_width - length($size_total) + 3); print $size_total.$space."total\n"; } ######################################################################## # packages # ######################################################################## #----------------------------------------------------------------------- package HTMLFile; #----------------------------------------------------------------------- # public # $htmlfile = new HTMLFile($uri) # # This method creates a new HTMLFile object given a path or URI (as a # string). sub new { my ($package_name, $uri_string) = @_; my $uri; my $is_absolute; my $is_remote; my $htmlfile; # if given URI denotes remote file (given by HTTP address) if (::is_http_uri($uri_string)) { # save URI for that file using URI $uri = new URI($uri_string)->canonical; # URIs for remote files are considered absolute $is_absolute = 1; # remember that URI references remote file $is_remote = 1; } # else given URI denotes local file else { # save URI for that file using URI::file $uri = new URI::file($uri_string); # save whether given path is absolute or relative $is_absolute = File::Spec->file_name_is_absolute($uri_string); # remember that URI references local file $is_remote = 0; } # create object and save private data $htmlfile = { uri => $uri, uri_is_absolute => $is_absolute, is_remote => $is_remote, resources => {} }; # return created object bless $htmlfile; return $htmlfile; } # public # $htmlfile->collect_report_data() # # This method collects all data that is necessary to print a report for # this HTMLFile. That is the size of every resource directly or # indirectly referenced in this HTMLFile. It decides depending on the # configuration options whether remote resources or their sizes are # retrieved or not. sub collect_report_data { my $self = shift; # Retrieve and save the URIs of all resources that are directly or # indirectly referenced by this HTMLFile. $self->collect_resource_uris(); # Retrieve and save the size of each found resources $self->collect_resource_sizes(); } # private # $htmlfile->collect_resource_uris() # # This method retrieves and saves the URIs of all resources that are # directly or indirectly referenced by this HTMLFile. The results are # saved in the hash HTMLFile->{resources}. It decides depending on the # configuration options whether remote resources are retrieved or not. sub collect_resource_uris { my $self = shift; # create a set of to be parsed html resources and add the normalized # URI of the htmlfile itself to this set my %unparsed_html_resources = ( $self->normalized_uri_string() => 1 ); # while there is a html file in the set of to be parsed resources while (keys(%unparsed_html_resources)) { # take one html file out of the set of to be parsed resources # and add the file itself to its (whole) resources my @unparsed_html_resources_keys = keys(%unparsed_html_resources); my $htmlfile_normalized_uri_string = $unparsed_html_resources_keys[0]; delete $unparsed_html_resources{$htmlfile_normalized_uri_string}; $self->{resources}->{$htmlfile_normalized_uri_string} = 0; my $htmlfile; # if the chosen html file lies at a remote location if (::is_http_uri($htmlfile_normalized_uri_string)) { # if the user configured that remote resources should be retrieved if ($config{'retrieve_remote_resources'}) { # retrieve the html file and save it to a temporary location # the location of the retrieved file is save in $htmlfile my $final_uri; # if retrieval was successful if (::download_remote_resource( $htmlfile_normalized_uri_string, \$final_uri, \$htmlfile)) { # save final URI as the one we will be working with $htmlfile_normalized_uri_string = $$final_uri; } # else retrieval failed else { # skip to next resource next; } } # else html is remote resource but should not be retrieved else { # skip parsing it next; } } # else the chosen html file is local else { # the given URL can directly be used as the file that is # actually $htmlfile = $htmlfile_normalized_uri_string; } # parse the html file and let the parser save every html tag that # references a resource into a structure that can be easily examined my $parser = HTML::Parser->new(api_version => 3); $parser->report_tags(qw(img bgsound input embed script link body applet object param frame iframe layer ilayer q)); my @tags; $parser->handler(start => \@tags, 'tag, attr'); $parser->parse_file($htmlfile); # for each such html tag the parser found foreach my $tag (@tags) { # if found tag is an , , , or