#!/usr/bin/perl -w
#
# Copyright (C) 2007, Joshua D. Abraham (jabra@spl0it.org)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
# use strict;
#
# rip_links.pl - extracts links from a webpage
#
# ex: ./rip_links.pl -u http://www.digg.com
#
use strict;
use Getopt::Long;
use HTML::SimpleLinkExtor;
use vars qw( $PROG );
( $PROG = $0 ) =~ s/^.*[\/\\]//;    # Truncate calling path from the prog name
my $AUTH    = 'Joshua D. Abraham';  # author
my $EMAIL   = 'jabra@spl0it.org';   # email
my $VERSION = '1.00';               # version
my %options;                        # getopt option hash
my $concat;
my $scheme ;
my $extor = HTML::SimpleLinkExtor->new();

#
# help:
# display help information
#
sub help {
    print "Usage: $PROG [Input Option] [Option] 
    -u  --url               Parse url and extract links
    -f  --file              Parse file and extract links

        --all               Print all Links (Relative and Absolute)
    -a  --authority         Print only the scheme and authority(Only Absolute)      
    -c  --concat [string]   Concatenate string onto end of link(Only Absolute)
    -s  --scheme            Print the links of specific scheme(Only Absolute)
    -r  --relative          Print all the Relative Links
    
    -v  --version           Display version
    -h  --help              Display this information
Send Comments to $AUTH ( $EMAIL )\n";
    exit;
}

#
# print_version:
# displays version
#
sub print_version {
    print "$PROG version $VERSION by $AUTH ( $EMAIL )\n";
    exit;
}

#
# print_authority
# prints the scheme and authority for a URI
#
sub print_authority {
    my ($uri) = @_;
    my ($scheme, $authority, $path, $query, $fragment) = $uri =~ m/(?:([^:\/?#]+):)?(?:\/\/([^\/?\#]*))?([^?#]*)(?:\?([^#]*))?(?:\#(.*))?/;
    print $scheme . '://' . $authority . "\n";
}
if ( @ARGV == 0 ) {
    help;
}
GetOptions(
    \%options,
    'file|f=s', 'url|u=s', 'all','concat|c=s', 'authority|a','scheme|s=s','relative|r',
    'help|h'    => sub { help(); },
    'version|v' => sub { print_version(); },
) or exit 1;

if ( $options{file} and -r $options{file} ) {
    $extor->parse_file( $options{file} );
}
elsif ( $options{url} ) {
    $extor->parse_url( $options{url} );
}
else {
    print "Error: Input type not set\n";
    help();
}

my @links;
if ( $options{all} ) {
    @links = $extor->links();
}
elsif ( $options{relative} ){
    @links = $extor->relative_links;
}
elsif ( $options{scheme} ) {
    $options{scheme} =~ s/:\/\///g;
    @links = $extor->schemes( $scheme );
}
else {
    @links = $extor->schemes( qw( http https ) )
}

if ($options{authority}){
    foreach (@links){
        print_authority($_);
    }
}
elsif ( $options{concat} ) {
    $concat = $options{concat};
    foreach  (@links){
        print "$_$concat\n";
    }
}
else {
    foreach(@links){
        print "$_\n";
    }
}
