#!/usr/bin/perl
use strict;
use warnings;
# Turn on autoflush for the currently selected output handle (STDOUT, as
# nothing has called select() yet) so output is not held back in a buffer.
$|++;

# Program version string, printed by _help().
my $VERSION = '0.04';

#----------------------------------------------------------------------------

=head1 NAME

xhtml-valid - test web page DTD validation.

=head1 SYNOPSIS

  xhtml-valid \
         [-i|ignore file] \
         [-u|url url] [--ulist file] \
         [-p|path path] [-f|file file] [--flist file] \
         [-h|help] [-v|version]

=head1 DESCRIPTION

Using either URLs or flat files, this program attempts to validate web pages
according to their own DTD.

=cut

# -------------------------------------
# Library Modules

use Getopt::Long;
use Test::XHTML::Valid;

# -------------------------------------
# Variables

# Parsed command line options; filled in by GetOptions() in init_options().
my %options;

# Default patterns passed to the validator's ignore() method. init_options()
# may push further patterns, read from the --ignore file, onto this list.
my @IGNORE = (
    qr/^mailto/,                    # mailto links
    qr/\.(xml|txt|pdf)$/i,          # documents
    qr/\.(tar\.gz|zip)$/i,          # archives
    qr/\.(mp4|avi|wmv)$/i,          # video
    qr/\.(jpg|bmp|gif|png)$/i,      # images
);

# -------------------------------------
# Program

##### INITIALISE #####

# Parse and validate the command line. This fills %options and may extend
# @IGNORE, so it must run before the validator is configured below.
init_options();

##### MAIN #####

# Create the validator and give it the complete list of ignore patterns.
my $txv = Test::XHTML::Valid->new();
$txv->ignore(@IGNORE);

# Only one source is processed per run. When several source options are
# given, the first match in this chain wins: url, ulist, flist, file, path.

# dynamic pages
if($options{url}) {
    $txv->process_pages($options{url});

} elsif($options{ulist}) {
    $txv->process_url_list($options{ulist});


# static pages
} elsif($options{flist}) {
    $txv->process_file_list($options{flist});

} elsif($options{file}) {
    # Pass the single file named by --file (not the --flist value).
    $txv->process_file($options{file});

} elsif($options{path}) {
    $txv->process_path($options{path});


# oops! no source given - show the usage screen and exit
} else {
    _help(1);
}

# NOTE(review): presumably re-attempts pages that failed for network reasons;
# confirm against the Test::XHTML::Valid documentation.
$txv->process_retries();

# process_results() is used here as a hash reference holding the summary
# counters printed below.
my $results = $txv->process_results();

printf "%5s: %s\n", $_, $results->{$_}  for(qw(FILES PASS FAIL NET));

# -------------------------------------
# Subroutines

# init_options()
#
# Parses the command line into the file-level %options hash, validates the
# path and file arguments, and appends the patterns found in the --ignore
# file (one regular expression per line) to the file-level @IGNORE list.
#
# Takes no arguments and returns nothing. Does not return at all when help
# or the version is requested, or when an option fails validation: those
# cases end in _help(), which exits. Dies if the ignore file exists but
# cannot be opened.
sub init_options {
    # GetOptions() returns false after warning about anything it could not
    # parse; show the usage screen rather than run on a misread command line.
    GetOptions( \%options,
        'path|p=s',
        'file|f=s',
        'flist=s',
        'url|u=s',
        'ulist=s',
        'ignore|i=s',
        'help|h',
        'version|v'
    ) or _help(1);

    _help(1)    if($options{help});
    _help(0)    if($options{version});

    if(defined $options{path} && ! -d $options{path}) {
        print "ERROR: path not found - $options{path}\n";
        _help(1);
    }

    # Every option naming an input file, the ignore file included, must
    # refer to an existing plain file.
    for my $opt (qw(file flist ulist ignore)) {
        next    unless(defined $options{$opt});
        next    if(-f $options{$opt});

        print "ERROR: file used in option '$opt' not found [$options{$opt}]\n";
        _help(1);
    }

    return  unless(defined $options{ignore});

    # Core three-argument open: no extra module has to be loaded.
    open(my $fh, '<', $options{ignore})
        or die "Cannot read file [$options{ignore}]: $!\n";

    while(my $pattern = <$fh>) {
        chomp $pattern;

        # A blank line would compile to a pattern that matches everything,
        # which would make the validator ignore every page.
        next    unless($pattern =~ /\S/);

        push @IGNORE, qr/$pattern/;
    }

    close($fh);

    return;
}

# _help($full)
#
# Prints the program name and version, then exits with status 0. When $full
# is true the usage screen is printed first. This sub never returns.
sub _help {
    my $full = shift;

    if($full) {
        # Interpolating heredoc: $0 expands to the program name and each
        # '\\' prints as a single shell line-continuation backslash.
        print <<"HERE";

Usage: $0 [-h] [-v] \\
         [-i file] \\
         [-u url] [--ulist file] \\
         [-p path] [-f file] [--flist file]


  -i file       patterns used to ignore URLs (e.g. user login)

  -u url        root target URL for validating
  --ulist file  file containing a list of target URLs

  -p path       target directory of XHTML files
  -f file       single target XHTML file path
  --flist file  file containing a list of XHTML file paths

  -h            this help screen
  -v            program version

  Note: The --url|u option acts as a crawler, testing any URL links found in
        the page that match a URL that would be below the given root URL. As
        such external links to the site and links that would be a parent of
        the given root are not tested.

        The --ulist option will only test the web links listed, and will NOT
        crawl any links within the page.

HERE

    }

    print "$0 v$VERSION\n\n";
    exit(0);
}

__END__

=head1 USAGE

This program can be used in several ways to validate web pages. Given a root
URL, it will crawl the website and validate every page it finds below that
root, or it can test only the URLs named in a list. Given a root local
directory, it will traverse the directory tree and validate every HTML file it
finds; it can also test a single file or a list of files. In short, it tries
to validate web pages.

=head2 URL Options

=over

=item * -u|url url

Given a root URL will traverse the website, validating all pages found that
are below the root URL. Thus external links and those outside of the root URL
are ignored.

=item * --ulist file

The given file should contain a list of URLs (one per line), which will then be
validated. Note that only the links listed are validated, no crawling of the
links within the page is performed.

=back

=head2 File Options

=over

=item * -p|path path

Given a root directory will traverse the directory tree and validate every
.html or .htm file it finds.

=item * -f|file file

Validates a single file.

=item * --flist file

The given file should contain a list of files (one per line), which will then
be validated.

=back

=head2 Supporting Options

=over

=item * -i|ignore file

The given file should contain patterns (one per line) used to ignore URLs and
files (e.g. user login) from validation.

By default mailto links and various document and binary file formats are
ignored, together with any non-'http' protocol.

=back

=head2 Other Options

=over

=item * -h|help

Provides a help screen.

=item * -v|version

Provides the current program version.

=back

=head1 BUGS, PATCHES & FIXES

There are no known bugs at the time of this release. However, if you spot a
bug or are experiencing difficulties that are not explained within the POD
documentation, please send bug reports and patches to barbie@cpan.org.

Fixes are dependent upon their severity and my availability. Should a fix not
be forthcoming, please feel free to (politely) remind me.

=head1 SEE ALSO

L<XML::LibXML>

=head1 AUTHOR

  Barbie, <barbie@cpan.org>
  for Miss Barbell Productions <http://www.missbarbell.co.uk>.

=head1 COPYRIGHT AND LICENSE

  Copyright (C) 2008-2011 Barbie for Miss Barbell Productions.

  This module is free software; you can redistribute it and/or
  modify it under the Artistic Licence v2.

=cut

