#!/usr/local/bin/perl -w

########################################################################################################################
# anaml.pl - converts Analog's HTML-2 logfile reports into XML, mapping each report onto an HTML-style table           #
# Version 1.0: 23.1.99 15:39 - 25.1.99                                                                                 #
# chocolateboy@usa.net - mail me with bugfixes, requests, marriage proposals &c.                                       #
#														       				     #
# Do anything you want with this script, but please keep this header, and let me know if you use it for anything cool  #
########################################################################################################################

use strict;

#########################################################################################################################
#                                                                                                                       #
# You'll need to set the following three variables to get things working - the rest can be adjusted according to taste  #
#                                                                                                                       #
#########################################################################################################################

# HTML report produced by Analog; this is the file that gets converted
my $SOURCE = '/export/htdocs/myreport.html';
# Full path of the directory that receives the .xml files (may be a new directory if you have permissions)
my $OUTDIR = '/export/htdocs/anaml';
# Analog language file used to recognise the reports; point it at the HTML variant (*h.lng) if one is available
my $LOCALE = '/usr/local/analog/lang/uk.lng';

#########################################################################################################################

# Column headings and the report name used by the Meta Report - localize to taste
my $GENERATOR = 'Generator';   # Heading for the Analog version/platform column
my $HOSTNAME  = 'Hostname';    # Heading for the server name column
my $META      = 'Meta Report'; # Name of the Analog "uber report"

# Tag and attribute names - fiddle with these to roll your own vocabulary
my $TABLE   = 'REPORT';  # Top-level tag wrapped around each report
my $TR      = 'RR';      # Row delimiter
my $TH      = 'RH';      # Column heading
my $TD      = 'RD';      # Report element
my $NAME    = 'NAME';    # Attribute carrying the report name
my $SUMMARY = 'SUMMARY'; # Attribute carrying a report's one-line summary (busiest month &c.)

# Extra text placed inside the top-level ($TABLE) tag, e.g. for formatting/debugging:
# my $DEBUG = ' BORDER=1';
my $DEBUG = ''; # Must always be defined, even if it is just the empty string
#########################################################################################################################

# Open the report to be xmlised and the language file needed to parse it.
# The three-argument form of open keeps the mode separate from the path, so a
# path containing characters such as '>' or '|' (or surrounding whitespace)
# can never be mistaken for a mode.  The handles stay as the barewords SOURCE
# and LOCALE because the rest of the script reads from and closes them by name.
open (SOURCE,'<',$SOURCE) or die ("Can't open $SOURCE: $!");
open (LOCALE,'<',$LOCALE) or die ("Can't open $LOCALE: $!");

# Read the whole report into one string.  $/ is undefined only inside the do
# block; "local" restores the normal line-at-a-time behaviour on exit, which
# the language file loop relies on.
my $slurp=do { local $/; <SOURCE> };

my $index=0; # Next free slot in @lingo (reused later as the report loop counter)
my (@lingo); # Language file phrases, comment lines excluded, in file order

# Load the language file into a list for easy access.  Lines starting with '#'
# are comments and do not get a slot, so the slot numbers used elsewhere count
# non-comment lines only.
while (<LOCALE>) {
    next if /^#/;
    # Remove the line terminator and nothing else: chop would also eat the last
    # real character of a final line that has no newline, and would leave the
    # "\r" of a DOS-format file in place.
    s/\r?\n\z//;
    $lingo[$index++]=$_;
}

# Language file slot of each verbose report name, paired with the short name
# that is used to name the output file.  Order is kept as-is so that, should
# two slots hold the same phrase, the later entry wins.
my @report_slots=(
    [61,'general'],      # General Summary
    [62,'monthly'],      # Monthly Report
    [64,'weekly'],       # Weekly Report
    [67,'fulldaily'],    # Daily Report
    [66,'daily'],        # Daily Summary
    [69,'fullhourly'],   # Hourly Report
    [70,'hourly'],       # Hourly Summary
    [72,'quarter'],      # Quarter-Hour Report
    [74,'five'],         # Five-Minute Report
    [76,'host'],         # Host Report
    [136,'domain'],      # Domain Report
    [88,'request'],      # Request Report
    [80,'directory'],    # Directory Report
    [84,'filetype'],     # File Type Report
    [144,'size'],        # File Size Report
    [92,'redir'],        # Redirection Report
    [96,'failure'],      # Failure Report
    [100,'referrer'],    # Referrer Report
    [104,'refsite'],     # Referring Site Report
    [108,'redirref'],    # Redirected Referrer Report
    [112,'failref'],     # Failed Referrer Report
    [132,'fullbrowser'], # Browser Report
    [128,'browser'],     # Browser Summary
    [116,'vhost'],       # Virtual Host Report
    [120,'user'],        # User Report
    [124,'failuser'],    # Failed User Report
    [140,'status'],      # Status Code Report
);

# This hash maps verbose (localized) report names onto their short version
my %names;
foreach my $slot (@report_slots) {
    my ($line,$short)=@$slot;
    $names{$lingo[$line]}=$short;
}
$names{$META}='meta'; # Meta Report

# Finished XML for each report, keyed by the short report name
my %exports;

# Localized phrases picked out of the language file.  They serve as column
# headings for the Meta Report and as landmarks for finding the matching
# values in the report text; $general is the name of the General Summary.
my ($timestamp,$from,$to)=@lingo[156..158];
my $days=$lingo[23];
my ($generator,$runtime)=@lingo[145..146];
my $general=$lingo[61];

# Cut the page into chunks at every <hr> (matched case-insensitively)
my @reports=split(/<hr>/i,$slurp);

# Build the Meta Report from the text above the first <hr> and below the last one.

my $report=defined $reports[0] ? $reports[0] : ""; # The stuff at the top before the proper reports

# Captures: page title, the hostname link (whole <a> element), the link text, and everything after </h1>
my ($title,$hostname,$host,$stats)=($report=~/^.+?<title>(.+?)<\/title>.+?(<a href=[^>]+>([^<]+)<\/a>)<\/h1>(.+)$/is);
warn "Can't find the title/hostname header at the top of the report\n" unless defined $stats;
foreach my $field ($title,$hostname,$host,$stats) { # A failed match must not leave undefined values behind
    $field="" unless defined $field;
}

my $xml="<$TABLE $NAME=\"$META\" $SUMMARY=\"$title\"$DEBUG>\n";
$xml.="<$TR><$TH>$HOSTNAME</$TH><$TH>$timestamp</$TH><$TH>$from</$TH><$TH>$to</$TH><$TH>$days</$TH><$TH>$GENERATOR</$TH><$TH>$runtime</$TH></$TR>\n";

$stats.=$reports[-1] if defined $reports[-1]; # The stuff at the bottom - this and the stats at the top make up the Meta Report
$stats=~s/<\/i>.+$//s; # Remove the tail
$stats=~s/\n+//g; # Death to newlines
$stats=~s/ +/ /g; # Double spaces -> single spaces

# From here on the six variables hold the values found in the report rather
# than the language file phrases.  The phrases are wrapped in \Q...\E so any
# punctuation they contain is matched literally instead of being read as
# regex syntax.
my @values=($stats=~/^\Q$timestamp\E (.+?)\.<br>\Q$from\E (.+?) \Q$to\E (.+?) \((.+?) \Q$days\E\)\.<i>\Q$generator\E (<a href.+?<\/a>).+?\Q$runtime\E:<\/b> (.+?)\.$/i);
warn "Can't parse the Meta Report statistics - check that \$LOCALE matches the report's language\n" unless @values;
($timestamp,$from,$to,$days,$generator,$runtime)=map { defined $values[$_] ? $values[$_] : "" } (0..5);

$xml.=
"<$TR><$TD>$hostname</$TD><$TD>$timestamp</$TD><$TD>$from</$TD><$TD>$to</$TD><$TD>$days</$TD><$TD>$generator</$TD><$TD>$runtime</$TD></$TR>\n";
$xml.="</$TABLE>";
$exports{$names{$META}}=$xml;

# Convert every proper report, one <hr>-delimited chunk at a time.  Chunk 0 (the
# page header) and the last chunk (the page footer) have already gone into the
# Meta Report, so only the chunks in between are visited.  A chunk that can't
# be parsed is skipped with a warning instead of producing a broken file.
REPORT: foreach $index (1..$#reports-1) {
    $report=$reports[$index];
    my ($name)=($report=~m/<h2><.*?>(.+)<.*?><\/h2>/i); # Report name = text inside the tag nested in <h2>

    unless (defined $name) {
        warn "Skipping section $index: no <h2> heading found\n";
        next REPORT;
    }

    # \Q...\E: the localized phrase is matched literally, not as a regex
    my $is_general=($name=~/\Q$general\E/);

    # Work out the output file name up front.  Without an entry in %names the
    # XML would be stored under an undefined key and written to "$OUTDIR/.xml".
    my $file=$names{$name};
    $file=$names{$general} if ($is_general and !defined $file);
    unless (defined $file) {
        warn "Skipping unrecognised report \"$name\" (not listed in \%names)\n";
        next REPORT;
    }

    if ($is_general) { # General summary needs special treatment
        my ($dope,$lastseven);

        if (($dope)=($report=~/<p><b>(.+)$/si)) { # No summary for the last seven days
            $lastseven="";
        } else {
            ($lastseven,$dope)=($report=~/<p>\((?!<b>)(.+?)\)\.\n<br><b>(.+)$/is); # Extract the summary and the stats
            unless (defined $dope) {
                warn "Skipping \"$name\": can't find its statistics\n";
                next REPORT;
            }
            $lastseven=" $SUMMARY=\"$lastseven.\""; # Format the L7 summary
        }

        $dope=~s/\n|://g; # Kill newlines and colons
        $dope=~s/(<\/b>)( +)/$1/ig; # Drop the spaces between a label's </b> and its value

        # Splitting on the <b>, </b> and <br><b> tags leaves label, value, label, value...
        # The pairs are taken in that order so the columns follow the report;
        # going through a hash would emit them in arbitrary order.
        my @fields=split(/(?:<br>)?<\/?b>/i,$dope);
        my $head="<$TR>";
        my $row="<$TR>";
        while (@fields) {
            my ($key,$value)=splice(@fields,0,2);
            $value="" unless defined $value; # Trailing label without a value
            $head.="<$TH>$key</$TH>";
            $row.="<$TD>$value</$TD>";
        }

        $xml="<$TABLE $NAME=\"$general\"$lastseven$DEBUG>\n$head</$TR>\n$row</$TR>\n</$TABLE>";
        $exports{$file}=$xml;
        next REPORT;
    }

    my ($meat,$summary)=($report=~/<pre><tt>(.+)<\/tt><\/pre>(.*)$/is); # Extract the meat and the summary (if present)
    unless (defined $meat) {
        warn "Skipping \"$name\": no <pre><tt> table found\n";
        next REPORT;
    }

    $summary=~s/\n//g; # Strip newlines from summary
    $meat=~s/^\s+//g; # Strip opening spaces
    $meat=~s/: +<img .+//ig; # Lose the graphics
    $meat=~s/(?:: +)?\n+ */\n/gs; # Clean out double newlines and orphaned delimiters
    $meat=~s/ +/ /g; # Squash multiple spaces (every run, hence /g)

    # First line = column headings, second line is discarded, the rest = data rows
    my ($head,$body)=($meat=~/^([^\n]+)\n[^\n]+\n(.+?)\n$/s);
    unless (defined $head) {
        warn "Skipping \"$name\": can't separate the column headings from the rows\n";
        next REPORT;
    }

    $head=~s/: +/<\/$TH><$TH>/g; # Columns are delimited by a colon plus spaces
    $head="<$TR><$TH>$head<\/$TH><\/$TR>";
    $body=~s/\n/<\/$TD><\/$TR>\n<$TR><$TD>/g; # One row per line
    $body=~s/: +/<\/$TD><$TD>/g;
    $body=~s/(\[.+?)<\/$TD><$TD>(.+?\])/$1: $2/gi; # Colons inside square brackets can be left alone
    $body="<$TR><$TD>$body<\/$TD><\/$TR>";
    $summary=$summary ? " $SUMMARY=\"$summary\"" : ""; # Only emit the attribute when there is a summary
    $xml="<$TABLE $NAME=\"$name\"$summary$DEBUG>\n$head\n$body\n<\/$TABLE>";
    $exports{$file}=$xml;
}

# Make sure there is a writable directory to put the XML files in

if (!(-e $OUTDIR)) { # If it doesn't exist, make it
    mkdir ($OUTDIR,0755) or die ("Can't create $OUTDIR: $!");
}

if (!(-d $OUTDIR)) { # Something else (e.g. a plain file) already has that name
    die ("$OUTDIR exists but is not a directory");
}

if (!(-w $OUTDIR)) { # If it does exist, make sure it's writable
    # $! says nothing useful after a file test on an existing path, so name the directory instead
    die ("Can't write to output directory $OUTDIR");
}

# Write each report to "$OUTDIR/<short name>.xml".  Every step is checked:
# buffered write errors (full disk, quota) may only surface at close time, and
# an unchecked open would silently lose the report.
foreach $report (keys %exports) { # Let's go
    my $path="$OUTDIR/$report.xml";
    open (my $out,'>',$path) or die ("Can't open $path for writing: $!");
    print {$out} $exports{$report} or die ("Can't write to $path: $!");
    close ($out) or die ("Can't close $path: $!");
}

# All done: release both input handles and report success to the caller
close (LOCALE);
close (SOURCE);
exit 0;