#!/usr/bin/perl
  # Filename:        site_index
  # Author:        David Ljung Madison <DaveSource.com>
  # See License:        http://MarginalHacks.com/License/
my $VERSION = 2.02;
  # Description:        Multi-site HTML site index generator
  # Input:        Reads a file (or STDIN) of "domain <tab> path"
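# Example:        "www.example.com <tab> /home/www/example <tab> 1"   (hypothetical values)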
  use strict;
  use POSIX;
  use IO::File;
umask 022;
##################################################
  # Setup the variables
  ##################################################
my $PROGNAME = $0;
$PROGNAME =~ s|.*/||;
my $DEFAULT_INDEX          = "\.s?html?\$";
my $DEFAULT_DEFAULT        = "index.html";
my $DEFAULT_OUT            = "Site_Index";      # Directory.  Will overwrite!
my $DEFAULT_DEPTH          = 0;                 # Max depth
my $DEFAULT_LINKS_PER_PAGE = 95;                # Break up pages
my $NO_INDEX               = ".no_index";       # Don't index these directories
my $NO_CONTENTS            = ".no_contents";    # Don't index contents
#########################
  # Usage
  #########################
  sub fatal {
  foreach my $msg (@_) { print STDERR "[$PROGNAME] ERROR:  $msg\n"; }
  exit(-1);
  }
sub debug {
  return unless $MAIN::DEBUG;
  foreach my $msg (@_) { print STDERR "[$PROGNAME] $msg\n"; }
  }
# Read in the domain info into @DOMAINS
  sub get_sites {
  my ($data,$file) = @_;
  open(FILE,"<$file") || usage("Can't read input [$file]");
  while (<FILE>) {
  chomp;
  s/^\s+//;
  s/\s+$//;
  my ($dom,$path,$importance) = split(/\t/,$_,3);
  # If it's not a domain, then it's just here for importance
  if ($dom =~ m|/|) {
  fatal("Non-domain entries [$dom] are useless without importance '1'")
  unless $importance==1;
  } else {
  # Domain
  push(@{$data->{domains}},$dom)
  unless $data->{domain}{$dom};
  $data->{domain}{$dom}{path} = $path
  unless $data->{domain}{$dom}{path};
  $data->{domain}{$dom}{importance} = $importance
  unless $data->{domain}{$dom}{importance} &&
  $data->{domain}{$dom}{importance} < $importance;
  }
  push(@{$data->{top}}, out_top_links($data,$dom,$path))
  if $importance==1;
  }
  close FILE;
 $data;
  }
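# For reference, get_sites() builds a structure roughly like this
# (hypothetical domain/path values):
#   $data->{domains}                   = [ 'www.example.com', ... ]    # insertion order
#   $data->{domain}{'www.example.com'} = { path => '/home/www/example', importance => 2 }
#   $data->{top}                       = [ "<a href='...'>...</a><br>\n", ... ]  # importance-1 links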
sub usage {
  foreach my $msg (@_) { print STDERR "ERROR:  $msg\n"; }
 my $def_index = $DEFAULT_INDEX;
  $def_index =~ s/\./\\./g;
print STDERR <<END_USAGE;
Usage:\t$PROGNAME [-d] <domain info...>
  \tBuilds site indexes for multiple domains
Reads file(s) (or '-' for STDIN) for domain info:
  domain   path    [importance]
Options:
  -ignore <regexp>   Ignore these paths/files
  -index <regexp>    Index these types of files [default '$def_index']
  -default <page>    Default directory page [default '$DEFAULT_DEFAULT']
  -out <dir>         Where to put the site index [default '$DEFAULT_OUT']
  -lpp <num>         Number of links per site index page [default '$DEFAULT_LINKS_PER_PAGE']
  -depth <num>       Maximum depth (0 ignored) [default '$DEFAULT_DEPTH']
  -noclean           Don't remove old index files
Examples:
  -ignore '/images\$'                  Ignore any directories named: "images"
  -ignore '/(images|thumbnails)\$'     Multiple ignores
  -ignore '/\\.'                       Ignore dot directories
  -index  '\\.(s?html?|txt)\$'          Index .shtm, .shtml, .htm, .html, .txt
Domains can have an optional "importance" value from 1-5:
  1)  List root link at the top of all site indexes (and treat as 2)
  2)  List in every site index first.
  3)  List in every site index.
  4)  Only a link to the top page appears in other indexes.
  5)  Doesn't appear in other indexes at all.
END_USAGE
  exit -1;
  }
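# A minimal example input file (hypothetical domains/paths), tab-separated:
#   www.example.com      /home/www/example    1
#   docs.example.com     /home/www/docs       3
#   private.example.com  /home/www/private    5
# and a sample run:  site_index -ignore '/images$' domains.txt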
sub parse_args {
  my (%data,@files);
 # Defaults
  $data{opt}{index} = $DEFAULT_INDEX;
  $data{opt}{default} = $DEFAULT_DEFAULT;
  $data{opt}{out} = $DEFAULT_OUT;
  $data{opt}{links_per_page} = $DEFAULT_LINKS_PER_PAGE;
  $data{opt}{depth} = $DEFAULT_DEPTH;
 while (my $arg=shift(@ARGV)) {
  if ($arg =~ /^-h$/) { usage(); }
  if ($arg =~ /^-d$/) { $MAIN::DEBUG=1; next; }
  if ($arg =~ /^-ignore(=(.+))?$/)  { $data{opt}{ignore} = $1 ? $2 : shift @ARGV; next; }
  if ($arg =~ /^-index(=(.+))?$/)   { $data{opt}{index} = $1 ? $2 : shift @ARGV; next; }
  if ($arg =~ /^-default(=(.+))?$/) { $data{opt}{default} = $1 ? $2 : shift @ARGV; next; }
  if ($arg =~ /^-out(=(.+))?$/)     { $data{opt}{out} = $1 ? $2 : shift @ARGV; next; }
  if ($arg =~ /^-lpp(=(.+))?$/)     { $data{opt}{links_per_page} = $1 ? $2 : shift @ARGV; next; }
  if ($arg =~ /^-depth(=(.+))?$/)   { $data{opt}{depth} = $1 ? $2 : shift @ARGV; next; }
  if ($arg =~ /^-no_?clean$/) { $data{opt}{noclean}=1; next; }
  if ($arg =~ /^-./) { usage("Unknown option: $arg"); }
  push(@files,$arg);
  }
  #usage("No input defined") unless $file;
  push(@files,"-") unless @files;
  get_sites(\%data,$_) foreach @files;
 unshift(@{$data{top}}, "<p><font size='+1'><b>Main links:</b></font><p>\n")
  if $data{top};
 \%data;
  }
#########################
  # Build a site index
  #########################
  sub index_last {        # Contents sorter, site_index goes last
  my ($data,$a,$b) = @_;
  return  1 if $a eq $data->{opt}{out};
  return -1 if $b eq $data->{opt}{out};
  return $a cmp $b;
  }
sub get_title {
  my ($data,$path,$file) = @_;
 my $title = undef;
  $file = "$path/$file" if $path;
  $file .= "/$data->{opt}{default}" if -d $file;
return $data->{titles}{$file} if $data->{titles}{$file};
 return $title unless open(F,"<$file");
  my ($in_title,$done_title) = (0,0);
  while (<F>) {
  $in_title = 1 if s/.*<title[^>]*>\s*//i;
  $done_title = 1 if s/<\/title.*//i;
  $title .= $_ if $in_title;
  last if $done_title;
  #if (m|<title>\s*(\S.*)\s*(</title>)?|) { close F; return $1; }
  last if $.>30;        # Only read first 30 lines
  last if m|</head|;
  }
  close F;
  chomp $title;
  $title =~ s/[\n\r]/ /g;
  $title =~ s/</&lt;/g;
  $title =~ s/>/&gt;/g;
  return $data->{titles}{$file}=$title;
  }
sub get_name {
  my ($data,$file) = @_;
  my $name = $file;
  $name =~ s/_/ /g;                # Underbar = space
  $name =~ s|$data->{opt}{index}||g;        # .html
  $name;
  }
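# e.g. get_name() turns "My_Vacation_Photos.html" into "My Vacation Photos"
# (underscores become spaces, the -index suffix is stripped).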
sub out_top_links {
  my ($data,$dom,$fullpath) = @_;
  $fullpath = $fullpath || $dom;
  my ($path,$file) = ($fullpath =~ m|(.*)/([^/]+)$|) ? ($1, $2) : ("",$fullpath);
  my $name = get_name($data,$file);
  $dom = "http://$dom" unless $dom =~ m|://|;
  out_link($dom, 1, $name, get_title($data,$path,$file));
  }
sub out_link {
  my ($url, $lvl, $name, $title) = @_;
  my $out = " "x(4*$lvl);
  $url =~ s/'/%27/g;
  $out .= "<a href='$url'>$name</a>";
  $out .= " $title" if defined($title) && $name ne $title;
  $out .= "<br>\n";
  $out;
  }
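# e.g. out_link('http://www.example.com/docs', 1, 'docs', 'Documentation')
# returns (hypothetical URL), indented 4 spaces per level:
#     <a href='http://www.example.com/docs'>docs</a> Documentation<br>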
sub domain_header {
  my ($data, $domain) = @_;
 # Find any domain aliases
  my $path = $data->{domain}{$domain}{path};
  my @domains = grep($path && $data->{domain}{$_}{path} eq $path, @{$data->{domains}});
  push(@domains,$domain) unless $path;
  my $str = join(" / ", map("<a href='http://$_/'>$_</a>", @domains));
 "<p><font size='+1'><b>$str</b></font><p>\n";
  }
sub no_index {
  my ($data,$dir,$path) = @_;
 # No index?
  return 1 if -f "$path/$NO_INDEX";
 # Skip it if it's the path of one of our subdomains, let that index it
  foreach my $dom ( @{$data->{domains}} ) {
  my $dom_path = $data->{domain}{$dom}{path};
  next unless $dom_path;
 # $dom is a subdomain of $dir (and not, for example, the other way around)
  next unless $dom_path =~ /^\Q$dir\E./;
  # And this is in the path of the subdomain
  next unless $path =~ /^\Q$dom_path\E/;
  return 1;
  }
  return 0;
  }
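# e.g. if blog.example.com is rooted at /home/www/example/blog, then indexing
# www.example.com (rooted at /home/www/example) skips /home/www/example/blog
# and lets that subdomain's own index cover it (hypothetical paths).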
sub get_directory {
  my ($data, $domain, $dir, $path, $lvl) = @_;
 return unless $dir;
  return if $data->{opt}{ignore} && $path =~ /$data->{opt}{ignore}/;
  return if $data->{opt}{depth} && $data->{opt}{depth} < $lvl;
my @links;
 # If we're level 0, then put in the domain header
  unless ($lvl) {
  push(@links, domain_header($data,$domain));
  $lvl++;
  }
 # Read the directory
  opendir(DIR, $path) || fatal("Couldn't read directory [$path]\n");
  my @dir = grep(-d "$path/$_" || /$data->{opt}{index}/ && !/$data->{opt}{default}/,
  grep(!/^\.{1,2}$/, readdir(DIR)) );
  @dir = grep( "$path/$_" !~ /$data->{opt}{ignore}/, @dir) if $data->{opt}{ignore};
  closedir(DIR);
 # Handle directories and html
  return unless (@dir);
  my $url_path = $path;
  $url_path =~ s|^\Q$dir\E|http://$domain|;
  foreach my $file ( sort { index_last($data,$a,$b) } @dir ) {
  my $name = get_name($data,$file);
 my $title = get_title($data,$path,$file);
  my $url = "$url_path/$file";
  unless (-d "$path/$file") {
  push(@links, out_link($url, $lvl, $name, $title));
  } elsif (!no_index($data,$dir,"$path/$file")) {
  my @dir_links;
  @dir_links = get_directory($data, $domain, $dir, "$path/$file", $lvl+1)
  unless (-f "$path/$file/$NO_CONTENTS" || $file eq $data->{opt}{out});
  push(@links, out_link("$url/", $lvl, $name, $title), @dir_links)
  if @dir_links || -f "$path/$file/$data->{opt}{default}";
  }
  }
  @links;
  }
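# get_directory() returns a flat list of HTML fragments: the domain header at
# level 0, then one out_link() line per page or subdirectory, indented 4 spaces
# per level of depth.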
#########################
  # Output
  #########################
  sub page_index {
  my ($page) = @_;
  ($page==1) ? "index.html" : "index.$page.html";
  }
sub start_index {
  my ($data, $out, $domain, $page, $pages) = @_;
 my $path = $data->{domain}{$domain}{path};
  my $file = "$path/$data->{opt}{out}/";
  mkdir($file, 0755) unless -d $file;
  $file .= page_index($page);
  open($out,">$file")
  || fatal("Can't write site index [$file]\n");
 print $out <<END_OF_HEADER;
  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
  <html>
  <head>
  <title>
  Hierarchy:  $domain
  </title>
  <meta http-equiv="content-type" content="text/html;charset=ISO-8859-1">
  </head>
  <body bgcolor=white>
  Hierarchy for local domains, generated by the <a href='http://MarginalHacks.com/'>MarginalHacks</a> tool <a href='http://MarginalHacks.com/Hacks/site_index/'>$PROGNAME</a><p>
 <p><hr><p>
END_OF_HEADER
 return $file unless $pages>1;
  print $out "  <table width='100%'><tr valign=top>\n";
  print $out "    <td width=60 bgcolor='#aaaaaa'>\n";
  foreach my $p ( 1..$pages ) {
  my $url = page_index($p);
  print $out ($page==$p) ?
  "      Page $p<br>\n" :
  "      <a href='$url'>Page $p</a><br>\n";
  }
  print $out "    </td>\n";
  print $out "    <td width='5'>&nbsp;</td>\n";
  print $out "    <td>\n";
  $file;
  }
sub end_index {
  my ($data, $out, $file, $pages) = @_;
 print $out <<END_PAGES_TABLE if $pages>1;
  </td>
  </tr></table>
END_PAGES_TABLE
 my $date = localtime;
  print $out <<END_OF_FOOTER;
  <p><hr><p>
  Generated on $date;
  </body>
  </html>
END_OF_FOOTER
 close($out);
  print "Wrote $file\n";
  }
sub clean_index {
  my ($data,$domain,$pages) = @_;
  my $path = $data->{domain}{$domain}{path};
  my $f;
  while (++$pages && ($f="$path/$data->{opt}{out}/".page_index($pages)) && -f $f) {
  unlink $f;
  print "Removed: $f\n";
  }
  }
sub output {
  my ($data, $domain, $show) = @_;
 # How many pages?
  my $lpp = $data->{opt}{links_per_page};
  my $num = $#$show+1;
  my $pages = POSIX::ceil($num/$lpp);
  # The page links themselves count.. (and so do the two credit links)
  while ($pages*$lpp < $num+($pages*($pages-1))+2) {
  if ($lpp<$pages) {        # Trouble
  print STDERR "Not enough links-per-page, increasing\n";
  $lpp*=1.25;
  $pages=int($pages/1.25);
  } else { $pages++; }
  }
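  # Worked example (hypothetical counts): 200 entries at lpp=95 gives
  # ceil(200/95)=3 pages; 3*95=285 already covers 200 + 3*(3-1) + 2 = 208,
  # so no extra page is added.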
 my $out = IO::File->new;
  my $link = 0;
  for(my $p=1; $p<=$pages; $p++) {
  my $cnt = $pages-1;
  my $file = start_index($data, $out, $domain, $p, $pages);
  for( ; $link<=$#$show && $cnt<$lpp; $link++) {
  print $out $show->[$link];
  $cnt++;
  }
  end_index($data, $out, $file, $pages);
  }
  clean_index($data, $domain, $pages) unless $data->{opt}{noclean};
  }
##################################################
  # Main code
  ##################################################
# Domain sorter
  srand(time^$$);
  sub domains {
  my ($data,$a,$b) = @_;
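  # split() in scalar context returns the number of fields, i.e. the number
  # of dot-separated components in each domain name.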
  my $a_num = split(/\./,$a); my $b_num = split(/\./,$b);
# No - subdomains will be how we compare same-importance domains
  #  # Subdomains go last
  #  return $a_num <=> $b_num unless $a_num==$b_num;
 # Rate by importance for 2-5
  my $a_imp = $data->{domain}{$a}{importance};
  my $b_imp = $data->{domain}{$b}{importance};
  $a_imp = 5 unless $a_imp;
  $b_imp = 5 unless $b_imp;
  $a_imp = 2 if $a_imp<2;
  $b_imp = 2 if $b_imp<2;
  return $a_imp <=> $b_imp unless $a_imp==$b_imp;
 # Number of subdomains
  return $a_num <=> $b_num unless $a_num==$b_num;
#$a cmp $b;
 # Random otherwise
  int(rand(2))*2-1;
  }
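# e.g. an importance-2 domain sorts before an importance-3 one; among equal
# importance, "example.com" (2 components) sorts before "www.example.com" (3);
# remaining ties are broken randomly (hypothetical names).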
sub main {
  my $data = parse_args();
 # Get the site index for each site
  my %did;
  foreach my $domain ( @{$data->{domains}} ) {
  my $path = $data->{domain}{$domain}{path};
  next unless $path;
  next if $did{$path}++;        # Skip if it's an alias
 print "Fetching index info: $domain\n";
  @{$data->{links}{$domain}} = get_directory($data, $domain, $path, $path, 0);
  delete $data->{links}{$domain} unless @{$data->{links}{$domain}}
  }
 # Write the site index for each site
  foreach my $domain ( @{$data->{domains}} ) {
  my @show = $data->{top} ? @{$data->{top}} : ();
  next unless $data->{domain}{$domain}{path};
  next unless $data->{links}{$domain};
 push(@show, @{$data->{links}{$domain}});
  foreach my $show_domain ( sort { domains($data,$a,$b); } @{$data->{domains}} ) {
  my $show_importance = $data->{domain}{$show_domain}{importance};
  next if $show_domain eq $domain;
  next if $show_importance>=5;        # Only on their own index
  push(@show, domain_header($data,$show_domain))
  if $show_importance==4 || !$data->{links}{$show_domain};
  next if $show_importance==4;
  push(@show, @{$data->{links}{$show_domain}}) if $data->{links}{$show_domain};
  }
 output($data, $domain, \@show);
  }
  } main();