#!/usr/bin/perl
# mirror.openwebmail.pl - mirror openwebmail with wget.
#
# 2004/07/15 tung@turtle.ee.ncku.edu.tw
#
# This script uses wget to mirror a web URL and then deletes local files
# that don't exist on the remote site.
# The stale files are found by comparing the local file list with the wget log.
#
# Exit codes:
#   1  wget program not found / not executable
#   2  invalid mirror url
#   3  download directory doesn't exist
#   4  excludelist member outside the remote dir, or wget log unreadable
#   5  keeplist member outside the download directory
#   6  cannot chdir into the download directory

use strict;
use warnings;

# The URL to be mirrored, don't forget the trailing /
my $mirror_url="http://turtle.ee.ncku.edu.tw/openwebmail/";

# The remote paths that you don't want to mirror, don't forget the trailing /
# eg: my @excludelist= (
#        "/openwebmail/download/old/",
#        "/openwebmail/download/redhat/rpm/daily-build/SRPMS/"
#     );
#
my @excludelist= ();

# The local directory for download.
# Use an absolute path: the script chdir()s into it and later hands the
# paths printed by find(1) to rmdir().
my $downloaddir="/home/ftp/pub/openwebmail/";

# Set $delete_nonexist to 0
# if you don't want this script to delete files not existing on remote site
my $delete_nonexist=1;

# When $delete_nonexist is set to 1,
# local dirs or files in the following list won't be deleted
# eg: my @keeplist= (
#        "/home/ftp/pub/openwebmail/dir1/",
#        "/home/ftp/pub/openwebmail/file1",
#        "/home/ftp/pub/openwebmail/file2"
#     );
#
my @keeplist= ();

# The wget executable.  Extra whitespace-separated options may follow the
# program path; they are passed to wget as-is (no shell quoting is applied).
my $wgetbin="/usr/local/bin/wget";

# The wget proxy
my $wgetproxy="";
#my $wgetproxy="http://someproxy.somedomain:3128/";

# The location for log files
my $wgetlog="/var/log/mirror.openwebmail.log";
my $dellog="/var/log/mirror.openwebmail.del.log";

############## No further configuration is required since here ##############

# Split the configured command into program + optional extra arguments so it
# can be executed without going through a shell.
my @wget_cmd = split ' ', $wgetbin;
my $wget_program = @wget_cmd ? $wget_cmd[0] : '';
if (! -x $wget_program) {
    print "Wget program $wget_program not found!\n";
    exit 1;
}
if ($mirror_url !~ m!^http://!) {
    print "Invalid mirror url $mirror_url!\n";
    exit 2;
}
$downloaddir =~ s!/+$!!;
if (! -d $downloaddir) {
    print "Directory $downloaddir doesn't exist, mirror is canceled\n";
    exit 3;
}

# Derive the remote directory and the number of leading path components
# wget has to cut (--cut-dirs) from the mirror url.
(my $url_path = $mirror_url) =~ s!^http://!!;
$url_path =~ s!/$!!;
my ($remote_host, @remote_dirs) = split(/\//, $url_path);
my $cutdir_number = scalar @remote_dirs;
my $remotedir = "/" . join("/", @remote_dirs);

# wget wants the full remote paths, so build the option before the
# excludelist entries are made relative below.
my $excludeparm = join(",", @excludelist);
$excludeparm = "--exclude-directories=" . $excludeparm if ($excludeparm ne "");

# Make excludelist entries relative to the remote dir.  The loop variable
# aliases the array element, so the substitution edits the list in place.
# When the url has no path component the remote dir is "/" itself.
my $remoteprefix = ($remotedir eq "/") ? "/" : "$remotedir/";
for my $entry (@excludelist) {
    if ($entry !~ s!^\Q$remoteprefix\E!!) {
        print "excludelist member $entry is not under $remotedir\n";
        exit 4;
    }
}

# Make keeplist entries relative to the download dir; directories get a
# trailing / so they only match as whole path components.
for my $entry (@keeplist) {
    $entry .= '/' if (-d $entry && $entry !~ m!/$!);
    if ($entry !~ s!^\Q$downloaddir\E/!!) {
        print "keeplist member $entry is not under $downloaddir\n";
        exit 5;
    }
}

############## most variables have been initialized before here ##############

# Everything below downloads into, and deletes relative to, the current
# directory, so a failed chdir must stop the run.
if (!chdir($downloaddir)) {
    print "Can't chdir to $downloaddir ($!), mirror is canceled\n";
    exit 6;
}

unlink($wgetlog, $dellog);

$ENV{'http_proxy'} = $wgetproxy;
my @wget_args = (
    ($excludeparm ne "" ? ($excludeparm) : ()),
    '--cache=off',
    '--reject=O=A,D=A,M=A,N=A,S=A,O=D,D=D,M=D,N=D,S=D',
    "--cut-dirs=$cutdir_number",
    '-m', '-nH', '-np',
    '-o', $wgetlog,
    $mirror_url,
);
# Run wget without a shell; its stdout is read and discarded, the real
# output goes to $wgetlog via -o.
if (open(my $wget_out, '-|', @wget_cmd, @wget_args)) {
    while (defined(my $discard = <$wget_out>)) { }
    close($wget_out);
} else {
    print "wget execution failed ($!)\n";
}

# Collect the names of all files wget reported, plus error indicators.
my (%exist, $error_timeout, $error_noservice);
my $exist_count = 0;
if ($delete_nonexist) {
    my $log_fh;
    if (!open($log_fh, '<', $wgetlog)) {
        print "wget logfile open error ($!)\n";
        exit 4;
    }
    while (my $line = <$log_fh>) {
        if ($line =~ /=> `(.*)'/) {
            $exist{$1} = 1;
            $exist_count++;
        } elsif ($line =~ /Operation timed out/ || $line =~ /Host is down/) {
            $error_timeout = 1;
        } elsif ($line =~ /Service Unavailable/) {
            $error_noservice = 1;
        }
    }
    close($log_fh);
}

# Deletions are recorded in $dellog; if it can't be written, log to STDOUT
# so the record of what was removed is not lost.
my $dellog_fh;
if (!open($dellog_fh, '>', $dellog)) {
    print "delete logfile $dellog open error ($!), logging to stdout\n";
    $dellog_fh = \*STDOUT;
}

# Refuse to delete anything when the wget log looks incomplete or unparsable.
if ($delete_nonexist) {
    if ($exist_count < 5) {
        $delete_nonexist = 0;
        print {$dellog_fh} "### Unknown wget log format? exist_count<5, delete_nonexist canceled.\n";
    } elsif ($error_timeout) {
        $delete_nonexist = 0;
        print {$dellog_fh} "### Operation timeout? delete_nonexist canceled.\n";
    } elsif ($error_noservice) {
        $delete_nonexist = 0;
        print {$dellog_fh} "### Service unavailable? delete_nonexist canceled.\n";
    }
}

# del faked index.html and files that don't appear in wgetlog
for my $path (find_paths($downloaddir, 'f')) {
    (my $rel = $path) =~ s!^\Q$downloaddir\E/!!;

    # never touch this script or its log files
    next if grep { $_ =~ /\Q$rel\E\z/ } ($0, $wgetlog, $dellog);

    # directory listings fetched with a sort-order query string
    if ($rel =~ /index\.html\?[CODMNS]=[ADMNS]/) {
        print {$dellog_fh} "del $rel (filelist)\n" if (unlink($rel));
        next;
    }

    # index.html that contains column-sort links near its top is a generated
    # directory listing, not real content
    if ($rel =~ /index\.html\z/) {
        my $buff = '';
        if (open(my $index_fh, '<', $rel)) {
            read($index_fh, $buff, 512);
            close($index_fh);
        }
        $buff = '' if (!defined $buff);
        if ($buff =~ /\QA HREF="?N=D"\E/ || $buff =~ /\QA HREF="?O=D"\E/) {
            print {$dellog_fh} "del $rel (filelist)\n" if (unlink($rel));
            next;
        }
    }

    next if has_prefix($rel, @excludelist);

    if ($delete_nonexist && !$exist{$rel}) {
        next if has_prefix($rel, @keeplist);
        print {$dellog_fh} "del $rel\n" if (unlink($rel));
    }
}

# remove empty dir
# Reverse order puts every directory before its parent, so a parent that
# becomes empty in this pass is removed too.  rmdir() fails harmlessly on
# non-empty directories.
for my $dir (sort { $b cmp $a } find_paths($downloaddir, 'd')) {
    next if ($dir eq $downloaddir);

    # Compare with a trailing / so that a listed directory itself (entries
    # end in /) is protected, not only the directories below it.
    (my $rel = "$dir/") =~ s!^\Q$downloaddir\E/!!;
    next if has_prefix($rel, @excludelist, @keeplist);

    print {$dellog_fh} "rmdir $dir (empty dir)\n" if (rmdir($dir));
}

close($dellog_fh);

# find_paths($dir, $type)
# Return the paths printed by `find $dir -type $type` (one per line, newline
# removed).  find is run without a shell.  Returns an empty list and prints
# a message if find can't be started.
sub find_paths {
    my ($dir, $type) = @_;
    my $find_fh;
    if (!open($find_fh, '-|', 'find', $dir, '-type', $type)) {
        print "find execution failed ($!)\n";
        return;
    }
    chomp(my @paths = <$find_fh>);
    close($find_fh);
    return @paths;
}

# has_prefix($path, @prefixes)
# Return 1 if $path starts with any of @prefixes (literal string comparison,
# no regex interpretation), otherwise 0.
sub has_prefix {
    my ($path, @prefixes) = @_;
    for my $prefix (@prefixes) {
        return 1 if (index($path, $prefix) == 0);
    }
    return 0;
}