#!/usr/bin/perl
# Filename: site_index
# Author: David Ljung Madison <DaveSource.com>
# See License: http://MarginalHacks.com/License/
my $VERSION= 2.02;
# Description: Multi-site HTML site index generator
# Input: Reads a file (or STDIN) of "domain <tab> path"
use strict;
use POSIX;
use IO::File;
umask 022;
##################################################
# Setup the variables
##################################################
my $PROGNAME = $0;
$PROGNAME =~ s|.*/||;
my $DEFAULT_INDEX = "\.s?html?\$";
my $DEFAULT_DEFAULT = "index.html";
my $DEFAULT_OUT = "Site_Index"; # Directory. Will overwrite!
my $DEFAULT_DEPTH = 0; # Max depth
my $DEFAULT_LINKS_PER_PAGE = 95; # Break up pages
my $NO_INDEX = ".no_index"; # Don't index these directories
my $NO_CONTENTS = ".no_contents"; # Don't index contents
#########################
# Usage
#########################
sub fatal {
foreach my $msg (@_) { print STDERR "[$PROGNAME] ERROR: $msg\n"; }
exit(-1);
}
sub debug {
return unless $MAIN::DEBUG;
foreach my $msg (@_) { print STDERR "[$PROGNAME] $msg\n"; }
}
# Read in the domain info into @DOMAINS
sub get_sites {
my ($data,$file) = @_;
open(FILE,"<$file") || usage("Can't read input [$file]");
while (<FILE>) {
chomp;
s/^\s+//;
s/\s+$//;
my ($dom,$path,$importance) = split(/\t/,$_,3);
# If it's not a domain, then it's just here for importance
if ($dom =~ m|/|) {
fatal("Non-domain entries [$dom] are useless without importance '1'")
unless $importance==1;
} else {
# Domain
push(@{$data->{domains}},$dom)
unless $data->{domain}{$dom};
$data->{domain}{$dom}{path} = $path
unless $data->{domain}{$dom}{path};
$data->{domain}{$dom}{importance} = $importance
unless $data->{domain}{$dom}{importance} &&
$data->{domain}{$dom}{importance} < $importance;
}
push(@{$data->{top}}, out_top_links($data,$dom,$path))
if $importance==1;
}
close FILE;
$data;
}
sub usage {
foreach my $msg (@_) { print STDERR "ERROR: $msg\n"; }
my $def_index = $DEFAULT_INDEX;
$def_index =~ s/\./\\./g;
print STDERR <<END_USAGE;
Usage:\t$PROGNAME [-d] <domain info...>
\tBuilds site indexes for multiple domains
Reads file(s) (or '-' for STDIN) for domain info:
domain path [importance]
Options:
-ignore <regexp> Ignore these paths/files
-index <regexp> Index these types of files [default '$def_index']
-default <page> Default directory page [default '$DEFAULT_DEFAULT']
-out <dir> Where to put the site index [default '$DEFAULT_OUT']
-lpp <num> Number of links per site index page.
-depth <num> Maximum depth (0 ignored) [default '$DEFAULT_DEPTH']
-noclean Don't remove old index files
Examples:
-ignore '/images\$' Ignore any directories named: "images"
-ignore '/(images|thumbnails)\$' Multiple ignores
-ignore '/\\.' Ignore dot directories
-index '\\.(s?html?|txt)\$' Index .shtm, .shtml, .htm, .html, .txt
Domains can have an optional "importance" value from 1-4:
1) List root link at the top of all site indexes (and treat as 2)
2) List in every site index first.
3) List in every site index.
4) Only a link to the top page appears in other indexes.
5) Doesn't appear in other indexes at all.
END_USAGE
exit -1;
}
sub parse_args {
my (%data,@files);
# Defaults
$data{opt}{index} = $DEFAULT_INDEX;
$data{opt}{default} = $DEFAULT_DEFAULT;
$data{opt}{out} = $DEFAULT_OUT;
$data{opt}{links_per_page} = $DEFAULT_LINKS_PER_PAGE;
$data{opt}{depth} = $DEFAULT_DEPTH;
while (my $arg=shift(@ARGV)) {
if ($arg =~ /^-h$/) { usage(); }
if ($arg =~ /^-d$/) { $MAIN::DEBUG=1; next; }
if ($arg =~ /^-ignore(=(.+))?$/) { $data{opt}{ignore}= $2 ? $3 : shift @ARGV; next; }
if ($arg =~ /^-index(=(.+))?$/) { $data{opt}{index}= $2 ? $3 : shift @ARGV; next; }
if ($arg =~ /^-default(=(.+))?$/) { $data{opt}{default}= $2 ? $3 : shift @ARGV; next; }
if ($arg =~ /^-out(=(.+))?$/) { $data{opt}{out}= $2 ? $3 : shift @ARGV; next; }
if ($arg =~ /^-lpp(=(.+))?$/) { $data{opt}{links_per_page}= $2 ? $3 : shift @ARGV; next; }
if ($arg =~ /^-depth(=(.+))?$/) { $data{opt}{depth}= $2 ? $3 : shift @ARGV; next; }
if ($arg =~ /^-no_?clean$/) { $data{opt}{noclean}=1; next; }
if ($arg =~ /^-./) { usage("Unknown option: $arg"); }
push(@files,$arg);
}
#usage("No input defined") unless $file;
push(@files,"-") unless @files;
map get_sites(\%data,$_), @files;
unshift(@{$data{top}}, "<p><font size='+1'><b>Main links:</font></b><p>\n")
if $data{top};
\%data;
}
#########################
# Build a site index
#########################
sub index_last { # Contents sorter, site_index goes last
my ($data,$a,$b) = @_;
return 1 if $a eq $data->{opt}{out};
return -1 if $b eq $data->{opt}{out};
return $a cmp $b;
}
sub get_title {
my ($data,$path,$file) = @_;
my $title = undef;
my $file = "$path/$file" if $path;
$file .= "/$data->{opt}{default}" if -d $file;
return $data->{titles}{$file} if $data->{titles}{$file};
return $title unless open(F,"<$file");
my ($in_title,$done_title) = (0,0);
while (<F>) {
$in_title = 1 if s/.*<title[^>]*>\s*//i;
$done_title = 1 if s/<\/title.*//i;
$title .= $_ if $in_title;
last if $done_title;
#if (m|<title>\s*(\S.*)\s*(</title>)?|) { close F; return $1; }
last if $.>30; # Only read first 30 lines
last if m|</head|;
}
close F;
chomp $title;
$title =~ s/[\n\r]/ /g;
$title =~ s/</</g;
$title =~ s/>/>/g;
return $data->{titles}{$file}=$title;
}
sub get_name {
my ($data,$file) = @_;
my $name = $file;
$name =~ s/_/ /g; # Underbar = space
$name =~ s|$data->{opt}{index}||g; # .html
$name;
}
sub out_top_links {
my ($data,$dom,$fullpath) = @_;
$fullpath = $fullpath || $dom;
my ($path,$file) = ($fullpath =~ m|(.*)/([^/]+)$|) ? ($1, $2) : ("",$fullpath);
my $name = get_name($data,$file);
$dom = "http://$dom" unless $dom =~ m|://|;
out_link($dom, 1, $name, get_title($data,$path,$file));
}
sub out_link {
my ($url, $lvl, $name, $title) = @_;
my $out = " "x(4*$lvl);
$url =~ s/'/%27/g;
$out .= "<a href='$url'>$name</a>";
$out .= " $title" if $name ne $title;
$out .= "<br>\n";
$out;
}
sub domain_header {
my ($data, $domain) = @_;
# Find any domain aliases
my $path = $data->{domain}{$domain}{path};
my @domains = grep($path && $data->{domain}{$_}{path} eq $path, @{$data->{domains}});
push(@domains,$domain) unless $path;
my $str = join(" / ", map("<a href='http://$_/'>$_</a>", @domains));
"<p><font size='+1'><b>$str</b></font><p>\n";
}
sub no_index {
my ($data,$dir,$path) = @_;
# No index?
return 1 if -f "$path/$NO_INDEX";
# Skip it if it's the path of one of our subdomains, let that index it
foreach my $dom ( @{$data->{domains}} ) {
my $dom_path = $data->{domain}{$dom}{path};
next unless $dom_path;
# $dom is a subdomain of $dir (and not, for example, the other way around)
next unless $dom_path =~ /^$dir./;
# And this is in the path of the subdomain
next unless $path =~ /^$dom_path/;
return 1;
}
return 0;
}
sub get_directory {
my ($data, $domain, $dir, $path, $lvl) = @_;
return unless $dir;
return if $data->{opt}{ignore} && $path =~ /$data->{opt}{ignore}/;
return if $data->{opt}{depth} && $data->{opt}{depth} < $lvl;
my @links;
# If we're level 0, then put in the domain header
unless ($lvl) {
push(@links, domain_header($data,$domain));
$lvl++;
}
# Read the directory
opendir(DIR, $path) || fatal("Couldn't read directory [$path]\n");
my @dir = grep(-d "$path/$_" || /$data->{opt}{index}/ && !/$data->{opt}{default}/,
grep(!/^\.{1,2}$/, readdir(DIR)) );
@dir = grep( "$path/$_" !~ /$data->{opt}{ignore}/, @dir) if $data->{opt}{ignore};
closedir(DIR);
# Handle directories and html
return unless (@dir);
my $url_path = $path;
$url_path =~ s|^$dir|http://$domain|g;
foreach my $file ( sort { index_last($data,$a,$b) } @dir ) {
my $name = get_name($data,$file);
my $title = get_title($data,$path,$file);
my $url = "$url_path/$file";
unless (-d "$path/$file") {
push(@links, out_link($url, $lvl, $name, $title));
} elsif (!no_index($data,$dir,"$path/$file")) {
my @dir_links = get_directory($data, $domain, $dir, "$path/$file", $lvl+1)
unless (-f "$path/$file/$NO_CONTENTS" || $file eq $data->{opt}{out});
push(@links, out_link("$url/", $lvl, $name, $title), @dir_links)
if @dir_links || -f "$path/$file/$data->{opt}{default}";
}
}
@links;
}
#########################
# Output
#########################
sub page_index {
my ($page) = @_;
($page==1) ? "index.html" : "index.$page.html";
}
sub start_index {
my ($data, $out, $domain, $page, $pages) = @_;
my $path = $data->{domain}{$domain}{path};
my $file = "$path/$data->{opt}{out}/";
mkdir($file, 0755) unless -d $file;
$file .= page_index($page);
open($out,">$file")
|| fatal("Can't write site index [$file]\n");
print $out <<END_OF_HEADER;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>
Hierarchy: $domain
</title>
<meta http-equiv="content-type" content="text/html;charset=ISO-8859-1">
</head>
<body bgcolor=white>
Hierarchy for local domains, generated by the <a href='http://MarginalHacks.com/'>MarginalHacks</a> tool <a href='http://MarginalHacks.com/Hacks/site_index/'>$PROGNAME</a><p>
<p><hr><p>
END_OF_HEADER
return $file unless $pages>1;
print $out " <table width='100%'><tr valign=top>\n";
print $out " <td width=60 bgcolor='#aaaaaa'>\n";
foreach my $p ( 1..$pages ) {
my $url = page_index($p);
print $out ($page==$p) ?
" Page $p<br>\n" :
" <a href='$url'>Page $p</a><br>\n";
}
print $out " </td>\n";
print $out " <td width='5'> </td>\n";
print $out " <td>\n";
$file;
}
sub end_index {
my ($data, $out, $file, $pages) = @_;
print $out <<END_PAGES_TABLE if $pages>1;
</td>
</tr></table>
END_PAGES_TABLE
my $date = localtime;
print $out <<END_OF_FOOTER;
<p><hr><p>
Generated on $date;
</body>
</html>
END_OF_FOOTER
close($out);
print "Wrote $file\n";
}
sub clean_index {
my ($data,$domain,$pages) = @_;
my $path = $data->{domain}{$domain}{path};
my $f;
while (++$pages && ($f="$path/$data->{opt}{out}/".page_index($pages)) && -f $f) {
unlink $f;
print "Removed: $f\n";
}
}
sub output {
my ($data, $domain, $show) = @_;
# How many pages?
my $lpp = $data->{opt}{links_per_page};
my $num = $#$show+1;
my $pages = POSIX::ceil($num/$lpp);
# The page links themselves count.. (and so do the two credit links)
while ($pages*$lpp < $num+($pages*($pages-1))+2) {
if ($lpp<$pages) { # Trouble
print STDERR "Not enough links-per-page, increasing\n";
$lpp*=1.25;
$pages=int($pages/1.25);
} else { $pages++; }
}
my $out = new IO::File;
my $link = 0;
for(my $p=1; $p<=$pages; $p++) {
my $cnt = $pages-1;
my $file = start_index($data, $out, $domain, $p, $pages);
for( ; $link<=$#$show && $cnt<$lpp; $link++) {
print $out $show->[$link];
$cnt++;
}
end_index($data, $out, $file, $pages);
}
clean_index($data, $domain, $pages) unless $data->{opt}{noclean};
}
##################################################
# Main code
##################################################
# Domain sorter
srand(time^$$);
sub domains {
my ($data,$a,$b) = @_;
my $a_num = split(/\./,$a); my $b_num = split(/\./,$b);
# No - subdomains will be how we compare same-importance domains
# # Subdomains go last
# return $a_num <=> $b_num unless $a_num==$b_num;
# Rate by importance for 2-5
my $a_imp = $data->{domain}{$a}{importance};
my $b_imp = $data->{domain}{$b}{importance};
$a_imp = 5 unless $a_imp;
$b_imp = 5 unless $b_imp;
$a_imp = 2 if $a_imp<2;
$b_imp = 2 if $b_imp<2;
return $a_imp <=> $b_imp unless $a_imp==$b_imp;
# Number of subdomains
return $a_num <=> $b_num unless $a_num==$b_num;
#$a cmp $b;
# Random otherwise
int(rand(2))*2-1;
}
sub main {
my $data = parse_args();
# Get the site index for each site
my %did;
foreach my $domain ( @{$data->{domains}} ) {
my $path = $data->{domain}{$domain}{path};
next unless $path;
next if $did{$path}++; # Skip if it's an alias
print "Fetching index info: $domain\n";
@{$data->{links}{$domain}} = get_directory($data, $domain, $path, $path, 0);
delete $data->{links}{$domain} unless @{$data->{links}{$domain}}
}
# Write the site index for each site
foreach my $domain ( @{$data->{domains}} ) {
my @show = @{$data->{top}};
next unless $data->{domain}{$domain}{path};
next unless $data->{links}{$domain};
push(@show, @{$data->{links}{$domain}});
foreach my $show_domain ( sort { domains($data,$a,$b); } @{$data->{domains}} ) {
my $show_importance = $data->{domain}{$show_domain}{importance};
next if $show_domain eq $domain;
next if $show_importance>=5; # Only on their own index
push(@show, domain_header($data,$show_domain))
if $show_importance==4 || !$data->{links}{$show_domain};
next if $show_importance==4;
push(@show, @{$data->{links}{$show_domain}}) if $data->{links}{$show_domain};
}
output($data, $domain, \@show);
}
} main();
0 Comments:
Post a Comment