Friday, August 10, 2007

Site Indexer for Domains
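The script below builds HTML site indexes for one or more domains hosted on the same machine. It reads a tab-separated list of "domain, document root, optional importance" entries, walks each document root for indexable pages, and writes the resulting index pages into a Site_Index directory under each root. As a purely illustrative example (these domains and paths are made up), an input file might look like:

    www.example.com	/var/www/example	1
    blog.example.com	/var/www/blog	3

and could be fed to the script with something like:

    ./site_index -ignore '/images$' sites.txt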

#!/usr/bin/perl

# Filename: site_index

# Author: David Ljung Madison <DaveSource.com>

# See License: http://MarginalHacks.com/License/

my $VERSION= 2.02;

# Description: Multi-site HTML site index generator

# Input: Reads a file (or STDIN) of "domain <tab> path [<tab> importance]"

use strict;

use POSIX;

use IO::File;


umask 022;


##################################################

# Setup the variables

##################################################

my $PROGNAME = $0;

$PROGNAME =~ s|.*/||;


my $DEFAULT_INDEX = "\.s?html?\$";

my $DEFAULT_DEFAULT = "index.html";

my $DEFAULT_OUT = "Site_Index"; # Directory. Will overwrite!

my $DEFAULT_DEPTH = 0; # Max depth

my $DEFAULT_LINKS_PER_PAGE = 95; # Break up pages

my $NO_INDEX = ".no_index"; # Don't index these directories

my $NO_CONTENTS = ".no_contents"; # Don't index contents


#########################

# Usage

#########################

sub fatal {

foreach my $msg (@_) { print STDERR "[$PROGNAME] ERROR: $msg\n"; }

exit(-1);

}


sub debug {

return unless $MAIN::DEBUG;

foreach my $msg (@_) { print STDERR "[$PROGNAME] $msg\n"; }

}


# Read in the domain info into @DOMAINS

sub get_sites {

my ($data,$file) = @_;

open(FILE,"<$file") || usage("Can't read input [$file]");

while (<FILE>) {

chomp;

s/^\s+//;

s/\s+$//;

my ($dom,$path,$importance) = split(/\t/,$_,3);

# If it's not a domain, then it's just here for importance

if ($dom =~ m|/|) {

fatal("Non-domain entries [$dom] are useless without importance '1'")

unless $importance==1;

} else {

# Domain

push(@{$data->{domains}},$dom)

unless $data->{domain}{$dom};

$data->{domain}{$dom}{path} = $path

unless $data->{domain}{$dom}{path};

$data->{domain}{$dom}{importance} = $importance

unless $data->{domain}{$dom}{importance} &&

$data->{domain}{$dom}{importance} < $importance;

}

push(@{$data->{top}}, out_top_links($data,$dom,$path))

if $importance==1;

}

close FILE;


$data;

}


sub usage {

foreach my $msg (@_) { print STDERR "ERROR: $msg\n"; }


my $def_index = $DEFAULT_INDEX;

$def_index =~ s/\./\\./g;


print STDERR <<END_USAGE;


Usage:\t$PROGNAME [-d] <domain info...>

\tBuilds site indexes for multiple domains


Reads file(s) (or '-' for STDIN) for domain info:

domain <tab> path [<tab> importance]


Options:

-ignore <regexp> Ignore these paths/files

-index <regexp> Index these types of files [default '$def_index']

-default <page> Default directory page [default '$DEFAULT_DEFAULT']

-out <dir> Where to put the site index [default '$DEFAULT_OUT']

-lpp <num> Number of links per site index page [default '$DEFAULT_LINKS_PER_PAGE']

-depth <num> Maximum depth (0 ignored) [default '$DEFAULT_DEPTH']

-noclean Don't remove old index files


Examples:

-ignore '/images\$' Ignore any directories named: "images"

-ignore '/(images|thumbnails)\$' Multiple ignores

-ignore '/\\.' Ignore dot directories

-index '\\.(s?html?|txt)\$' Index .shtm, .shtml, .htm, .html, .txt


Domains can have an optional "importance" value from 1-5:

1) List root link at the top of all site indexes (and treat as 2)

2) List in every site index first.

3) List in every site index.

4) Only a link to the top page appears in other indexes.

5) Doesn't appear in other indexes at all.


END_USAGE

exit -1;

}


sub parse_args {

my (%data,@files);


# Defaults

$data{opt}{index} = $DEFAULT_INDEX;

$data{opt}{default} = $DEFAULT_DEFAULT;

$data{opt}{out} = $DEFAULT_OUT;

$data{opt}{links_per_page} = $DEFAULT_LINKS_PER_PAGE;

$data{opt}{depth} = $DEFAULT_DEPTH;


while (my $arg=shift(@ARGV)) {

if ($arg =~ /^-h$/) { usage(); }

if ($arg =~ /^-d$/) { $MAIN::DEBUG=1; next; }

if ($arg =~ /^-ignore(=(.+))?$/) { $data{opt}{ignore}= $1 ? $2 : shift @ARGV; next; }

if ($arg =~ /^-index(=(.+))?$/) { $data{opt}{index}= $1 ? $2 : shift @ARGV; next; }

if ($arg =~ /^-default(=(.+))?$/) { $data{opt}{default}= $1 ? $2 : shift @ARGV; next; }

if ($arg =~ /^-out(=(.+))?$/) { $data{opt}{out}= $1 ? $2 : shift @ARGV; next; }

if ($arg =~ /^-lpp(=(.+))?$/) { $data{opt}{links_per_page}= $1 ? $2 : shift @ARGV; next; }

if ($arg =~ /^-depth(=(.+))?$/) { $data{opt}{depth}= $1 ? $2 : shift @ARGV; next; }

if ($arg =~ /^-no_?clean$/) { $data{opt}{noclean}=1; next; }

if ($arg =~ /^-./) { usage("Unknown option: $arg"); }

push(@files,$arg);

}

#usage("No input defined") unless $file;

push(@files,"-") unless @files;


map get_sites(\%data,$_), @files;


unshift(@{$data{top}}, "<p><font size='+1'><b>Main links:</b></font><p>\n")

if $data{top};


\%data;

}


#########################

# Build a site index

#########################

sub index_last { # Contents sorter, site_index goes last

my ($data,$a,$b) = @_;

return 1 if $a eq $data->{opt}{out};

return -1 if $b eq $data->{opt}{out};

return $a cmp $b;

}


sub get_title {
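# Pull the page title from the first <title>...</title> in an HTML file
# (reading at most the first 30 lines), caching results in $data->{titles}.
# Directories fall back to their -default page.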

my ($data,$path,$file) = @_;


my $title = undef;

$file = "$path/$file" if $path;

$file .= "/$data->{opt}{default}" if -d $file;


return $data->{titles}{$file} if $data->{titles}{$file};


return $title unless open(F,"<$file");

my ($in_title,$done_title) = (0,0);

while (<F>) {

$in_title = 1 if s/.*<title[^>]*>\s*//i;

$done_title = 1 if s/<\/title.*//i;

$title .= $_ if $in_title;

last if $done_title;

#if (m|<title>\s*(\S.*)\s*(</title>)?|) { close F; return $1; }

last if $.>30; # Only read first 30 lines

last if m|</head|;

}

close F;

return $title unless defined $title; # no <title> found

chomp $title;

$title =~ s/[\n\r]/ /g;

$title =~ s/</&lt;/g;

$title =~ s/>/&gt;/g;

return $data->{titles}{$file}=$title;

}


sub get_name {

my ($data,$file) = @_;

my $name = $file;

$name =~ s/_/ /g; # Underbar = space

$name =~ s|$data->{opt}{index}||g; # .html

$name;

}


sub out_top_links {

my ($data,$dom,$fullpath) = @_;

$fullpath = $fullpath || $dom;

my ($path,$file) = ($fullpath =~ m|(.*)/([^/]+)$|) ? ($1, $2) : ("",$fullpath);

my $name = get_name($data,$file);

$dom = "http://$dom" unless $dom =~ m|://|;

out_link($dom, 1, $name, get_title($data,$path,$file));

}


sub out_link {

my ($url, $lvl, $name, $title) = @_;

my $out = "&nbsp;"x(4*$lvl);

$url =~ s/'/%27/g;

$out .= "<a href='$url'>$name</a>";

$out .= " $title" if $name ne $title;

$out .= "<br>\n";

$out;

}


sub domain_header {

my ($data, $domain) = @_;


# Find any domain aliases

my $path = $data->{domain}{$domain}{path};

my @domains = grep($path && $data->{domain}{$_}{path} eq $path, @{$data->{domains}});

push(@domains,$domain) unless $path;

my $str = join(" / ", map("<a href='http://$_/'>$_</a>", @domains));


"<p><font size='+1'><b>$str</b></font><p>\n";

}


sub no_index {

my ($data,$dir,$path) = @_;


# No index?

return 1 if -f "$path/$NO_INDEX";


# Skip it if it's the path of one of our subdomains, let that index it

foreach my $dom ( @{$data->{domains}} ) {

my $dom_path = $data->{domain}{$dom}{path};

next unless $dom_path;


# $dom is a subdomain of $dir (and not, for example, the other way around)

next unless $dom_path =~ /^$dir./;

# And this is in the path of the subdomain

next unless $path =~ /^$dom_path/;

return 1;

}

return 0;

}


sub get_directory {
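# Recursively collect index links for one directory tree, skipping -ignore
# matches, stopping at -depth, and honoring the $NO_INDEX / $NO_CONTENTS
# marker files (subdomain document roots are left to their own indexes).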

my ($data, $domain, $dir, $path, $lvl) = @_;


return unless $dir;

return if $data->{opt}{ignore} && $path =~ /$data->{opt}{ignore}/;

return if $data->{opt}{depth} && $data->{opt}{depth} < $lvl;


my @links;


# If we're level 0, then put in the domain header

unless ($lvl) {

push(@links, domain_header($data,$domain));

$lvl++;

}


# Read the directory

opendir(DIR, $path) || fatal("Couldn't read directory [$path]\n");

my @dir = grep(-d "$path/$_" || /$data->{opt}{index}/ && !/$data->{opt}{default}/,

grep(!/^\.{1,2}$/, readdir(DIR)) );

@dir = grep( "$path/$_" !~ /$data->{opt}{ignore}/, @dir) if $data->{opt}{ignore};

closedir(DIR);


# Handle directories and html

return unless (@dir);

my $url_path = $path;

$url_path =~ s|^$dir|http://$domain|g;

foreach my $file ( sort { index_last($data,$a,$b) } @dir ) {

my $name = get_name($data,$file);


my $title = get_title($data,$path,$file);

my $url = "$url_path/$file";

unless (-d "$path/$file") {

push(@links, out_link($url, $lvl, $name, $title));

} elsif (!no_index($data,$dir,"$path/$file")) {

my @dir_links;

@dir_links = get_directory($data, $domain, $dir, "$path/$file", $lvl+1)

unless (-f "$path/$file/$NO_CONTENTS" || $file eq $data->{opt}{out});

push(@links, out_link("$url/", $lvl, $name, $title), @dir_links)

if @dir_links || -f "$path/$file/$data->{opt}{default}";

}

}

@links;

}


#########################

# Output

#########################

sub page_index {

my ($page) = @_;

($page==1) ? "index.html" : "index.$page.html";

}


sub start_index {

my ($data, $out, $domain, $page, $pages) = @_;


my $path = $data->{domain}{$domain}{path};

my $file = "$path/$data->{opt}{out}/";

mkdir($file, 0755) unless -d $file;

$file .= page_index($page);

open($out,">$file")

|| fatal("Can't write site index [$file]\n");


print $out <<END_OF_HEADER;

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html>

<head>

<title>

Hierarchy: $domain

</title>

<meta http-equiv="content-type" content="text/html;charset=ISO-8859-1">

</head>

<body bgcolor=white>

Hierarchy for local domains, generated by the <a href='http://MarginalHacks.com/'>MarginalHacks</a> tool <a href='http://MarginalHacks.com/Hacks/site_index/'>$PROGNAME</a><p>


<p><hr><p>

END_OF_HEADER


return $file unless $pages>1;

print $out " <table width='100%'><tr valign=top>\n";

print $out " <td width=60 bgcolor='#aaaaaa'>\n";

foreach my $p ( 1..$pages ) {

my $url = page_index($p);

print $out ($page==$p) ?

" Page $p<br>\n" :

" <a href='$url'>Page $p</a><br>\n";

}

print $out " </td>\n";

print $out " <td width='5'> </td>\n";

print $out " <td>\n";

$file;

}


sub end_index {

my ($data, $out, $file, $pages) = @_;


print $out <<END_PAGES_TABLE if $pages>1;

</td>

</tr></table>

END_PAGES_TABLE


my $date = localtime;

print $out <<END_OF_FOOTER;

<p><hr><p>

Generated on $date

</body>

</html>

END_OF_FOOTER


close($out);

print "Wrote $file\n";

}


sub clean_index {
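# Remove stale index.N.html pages left behind when an earlier run needed
# more pages than this one.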

my ($data,$domain,$pages) = @_;

my $path = $data->{domain}{$domain}{path};

my $f;

while (++$pages && ($f="$path/$data->{opt}{out}/".page_index($pages)) && -f $f) {

unlink $f;

print "Removed: $f\n";

}

}


sub output {
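# Write the site index for one domain, splitting the links across pages of
# roughly -lpp links each (the page-navigation links count against the budget).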

my ($data, $domain, $show) = @_;


# How many pages?

my $lpp = $data->{opt}{links_per_page};

my $num = $#$show+1;

my $pages = POSIX::ceil($num/$lpp);

# The page links themselves count.. (and so do the two credit links)

while ($pages*$lpp < $num+($pages*($pages-1))+2) {

if ($lpp<$pages) { # Trouble

print STDERR "Not enough links-per-page, increasing\n";

$lpp*=1.25;

$pages=int($pages/1.25);

} else { $pages++; }

}


my $out = new IO::File;

my $link = 0;

for(my $p=1; $p<=$pages; $p++) {

my $cnt = $pages-1;

my $file = start_index($data, $out, $domain, $p, $pages);

for( ; $link<=$#$show && $cnt<$lpp; $link++) {

print $out $show->[$link];

$cnt++;

}

end_index($data, $out, $file, $pages);

}

clean_index($data, $domain, $pages) unless $data->{opt}{noclean};

}


##################################################

# Main code

##################################################


# Domain sorter

srand(time^$$);

sub domains {

my ($data,$a,$b) = @_;
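# The splits below are in scalar context: $a_num/$b_num hold the number of
# dot-separated labels in each domain name.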

my $a_num = split(/\./,$a); my $b_num = split(/\./,$b);


# No - subdomains will be how we compare same-importance domains

# # Subdomains go last

# return $a_num <=> $b_num unless $a_num==$b_num;


# Rate by importance for 2-5

my $a_imp = $data->{domain}{$a}{importance};

my $b_imp = $data->{domain}{$b}{importance};

$a_imp = 5 unless $a_imp;

$b_imp = 5 unless $b_imp;

$a_imp = 2 if $a_imp<2;

$b_imp = 2 if $b_imp<2;

return $a_imp <=> $b_imp unless $a_imp==$b_imp;


# Number of subdomains

return $a_num <=> $b_num unless $a_num==$b_num;


#$a cmp $b;


# Random otherwise

int(rand(2))*2-1;

}


sub main {

my $data = parse_args();


# Get the site index for each site

my %did;

foreach my $domain ( @{$data->{domains}} ) {

my $path = $data->{domain}{$domain}{path};

next unless $path;

next if $did{$path}++; # Skip if it's an alias


print "Fetching index info: $domain\n";

@{$data->{links}{$domain}} = get_directory($data, $domain, $path, $path, 0);

delete $data->{links}{$domain} unless @{$data->{links}{$domain}};

}


# Write the site index for each site

foreach my $domain ( @{$data->{domains}} ) {

my @show = $data->{top} ? @{$data->{top}} : ();

next unless $data->{domain}{$domain}{path};

next unless $data->{links}{$domain};


push(@show, @{$data->{links}{$domain}});

foreach my $show_domain ( sort { domains($data,$a,$b); } @{$data->{domains}} ) {

my $show_importance = $data->{domain}{$show_domain}{importance};

next if $show_domain eq $domain;

next if $show_importance>=5; # Only on their own index

push(@show, domain_header($data,$show_domain))

if $show_importance==4 || !$data->{links}{$show_domain};

next if $show_importance==4;

push(@show, @{$data->{links}{$show_domain}}) if $data->{links}{$show_domain};

}


output($data, $domain, \@show);

}

} main();
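Run against an input file like the illustrative one above, the script prints a "Fetching index info:" line per domain followed by a "Wrote ..." line for each index page it generates; the pages land in the -out directory (Site_Index by default) under each domain's document root, e.g. /var/www/example/Site_Index/index.html in that made-up layout.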

