File: /usr/share/duck/DUCK.pm
# Copyright (C) 2016 Simon Kainz <skainz@debian.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# On Debian GNU/Linux systems, the complete text of the GNU General
# Public License can be found in `/usr/share/common-licenses/GPL-2'.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, you can find it on the World Wide
# Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free
# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301, USA.
use strict;
use warnings;
package DUCK;
my $VERSION ='0.13.1';
my $COPYRIGHT_YEAR ='2017';
my $default_user_agent='Mozilla/5.0 (X11; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0';
use Regexp::Common qw /URI Email::Address/;
use String::Similarity;
use File::Which;
use WWW::Curl::Easy;
use IPC::Open3;
use IO::Select;
use Net::DNS;
use Mail::Address;
use Domain::PublicSuffix;
use Config::Simple '-strict';
my $callbacks;
my $urlfixes={
TRAILING_PAREN_DOT=>1,
TRAILING_SLASH_DOT=>1,
TRAILING_SLASH_PAREN=>1,
TRAILING_COLON=>1,
TRAILING_PUNCTUATION=>1,
TRAILING_QUOTES=>1
};
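# What each substitution in extract_url() below does to a captured URL
# (the example.org inputs are hypothetical):
#   TRAILING_PAREN_DOT:   "http://example.org/foo)."  -> "http://example.org/foo"
#   TRAILING_SLASH_DOT:   "http://example.org/."      -> "http://example.org/"
#   TRAILING_SLASH_PAREN: "http://example.org/)"      -> "http://example.org/"
#   TRAILING_COLON:       "http://example.org/foo:"   -> "http://example.org/foo"
#   TRAILING_PUNCTUATION: "http://example.org/foo,"   -> "http://example.org/foo"
#   TRAILING_QUOTES:      "http://example.org/foo'"   -> "http://example.org/foo"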
my $self;
my $helpers={
svn =>0,
bzr =>0,
git =>0,
darcs =>1, # Always available, as the check uses WWW::Curl::Easy
hg => 0,
browser =>1 # Always available, as the check uses WWW::Curl::Easy
};
my $website_moved_regexs;
my $obsolete_sites;
my $cli_options;
my $tools=
{
git => {
cmd => 'git',
args => ['ls-remote','%URL%']
},
hg =>{
cmd => 'hg',
args => ['id','%URL%']
},
bzr => {
cmd => 'bzr',
args => ['-Ossl.cert_reqs=none','log','%URL%']
},
svn => {
cmd => 'svn',
args => ['--non-interactive','--trust-server-cert','info','%URL%']
}
};
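# Sketch of how an entry above is used (the URL is hypothetical):
# __run_helper() substitutes %URL% in the argument template, so checking a
# Git repository effectively runs
#   git ls-remote https://example.org/some/repo.git
# and an exit status of 0 means the repository is reachable.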
sub version
{
return $VERSION;
}
sub copyright_year
{
return $COPYRIGHT_YEAR;
}
sub new {
my $class = shift;
$self = {};
bless $self, $class;
$self->__find_helpers();
foreach (keys %$tools)
{
$tools->{$_}->{'args_count'}=scalar @{$tools->{$_}->{'args'}};
}
my $config_file=find_config_file();
if($config_file)
{
my $nc=Config::Simple->new($config_file);
my $hashes=$nc->get_block("parked_domains");
$website_moved_regexs=$hashes->{'regexes'};
$hashes=$nc->get_block("obsolete_sites");
if ($hashes)
{ if ($hashes->{'regexes'})
{
$obsolete_sites=$hashes->{'regexes'};
}
}
}
return $self;
}
sub cb()
{
$callbacks=
{
"Vcs-Browser" =>\&browser,
"Vcs-Darcs" =>\&darcs,
"Vcs-Git" =>\&git,
"Vcs-Hg" =>\&hg,
"Vcs-Svn" =>\&svn,
"Vcs-Bzr" =>\&bzr,
"Homepage" => \&browser,
"URL" => \&browser,
"Email" => \&email,
"Maintainer" => \&maintainer,
"Uploaders" => \&uploaders,
"Try-HTTPS" => \&try_https,
"SVN" => \&svn
};
return $callbacks;
}
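# Minimal usage sketch (field name and URL are illustrative): look up and
# invoke the checker for a control field.
#   my $cb = DUCK::cb();
#   my $result = $cb->{'Vcs-Git'}->('https://example.org/repo.git')
#       if exists $cb->{'Vcs-Git'};
# Most callbacks return an arrayref of hashrefs carrying at least 'retval',
# 'response' and 'url'; try_https and uploaders return a single such hashref.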
sub setOptions()
{
shift;
my ($ke,$va)=@_;
$cli_options->{$ke}=$va;
}
sub getUrlFixes()
{
return keys %{$urlfixes};
}
sub setUrlFixOptions()
{
shift;
my ($ke,$va)=@_;
if (!exists $urlfixes->{$ke}) {
print "Unknown urlfix parameter: $ke\nAvailable options are:\n\t".join("\n\t",getUrlFixes())."\n";
exit 1;
}
$urlfixes->{$ke}=$va;
}
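# Usage sketch (key and value are illustrative): disable one of the URL
# cleanups listed by getUrlFixes().
#   $duck->setUrlFixOptions('TRAILING_COLON', 0);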
sub __find_helpers()
{
$helpers->{git}=1 if defined which('git');
$helpers->{svn}=1 if defined which('svn');
$helpers->{hg}=1 if defined which('hg');
$helpers->{bzr}=1 if defined which('bzr');
}
sub getHelpers()
{ return $helpers; }
sub git()
{
my ($url)=@_;
my @urlparts=split(/\s+/,$url);
if ($tools->{'git'}->{'args_count'})
{
# Reset the argument list to its original length, dropping any
# "-b <branch>" left over from a previous call.
splice(@{$tools->{'git'}->{'args'}},$tools->{'git'}->{'args_count'});
}
if ($urlparts[1])
{
if ($urlparts[1] eq "-b" && $urlparts[2])
{
push(@{$tools->{'git'}->{'args'}},'-b '.$urlparts[2]);
}
}
return __run_helper('git',$urlparts[0]);
}
sub bzr()
{
my ($url)=@_;
return __run_helper('bzr',$url);
}
sub hg()
{
my ($url)=@_;
return __run_helper('hg',$url);
}
sub svn()
{
my ($url)=@_;
$ENV{SVN_SSH}='ssh -o BatchMode=yes';
return __run_helper('svn',$url);
}
sub browser()
{
my ($url)=@_;
# Strip trailing dots before handing the URL to curl.
$url =~ s/\.*$//g;
return __run_browser($url);
}
sub try_https_new($;$)
{
my $similarity_th=0.9;
my ($url,$erghttp)=@_;
# Strip trailing dots before building the https variant of the URL.
$url =~ s/\.*$//g;
my $res;
my $secure_url= $url;
$secure_url=~ s/http:/https:/g;
my $ergshttps= __run_browser($secure_url);
my $erghttps=$ergshttps->[0];
if ($erghttps->{'retval'} >0 )
{
# error with https, so do not suggest switching to https, report only http check results
return $erghttp;
}
# otherwise check similarity, and report if pages are (quite) the same
if ($erghttps && $erghttps->{'retval'} == 0 && $erghttp && $erghttp->{'body'} && $erghttp->{'finalscheme'} eq "http")
{
# https worked, now try to find out if the http and https pages match
my $similarity= similarity $erghttp->{'body'}, $erghttps->{'body'};
if ($similarity > $similarity_th)
{
$res->{'retval'}=2;
$res->{'response'}="The web page at $url works, but is also available via $secure_url, please consider switching to HTTPS URLs.";
return $res;
}
} else
{
# report nothing
$res->{'retval'}=0;
return $res;
}
$res->{'retval'}=0;
$res->{'response'}="";
$res->{'url'}=$url;
return $res;
}
sub try_https()
{
my $similarity_th=0.9;
my ($url)=@_;
$url =~ s/\.*$//g;
my $res;
# __run_browser() returns an arrayref of result hashrefs; the first entry
# holds the plain HTTP result.
my $ergs= __run_browser($url);
my $erghttp=$ergs->[0];
if ($erghttp->{'retval'} >0 ) {return $erghttp;}
my $secure_url= $url;
$secure_url=~ s/http:/https:/g;
my $ergshttps= __run_browser($secure_url);
my $erghttps=$ergshttps->[0];
if ($erghttps->{'retval'} >0 )
{
# error with https, so do not suggest switching to https, report only http check results
return $erghttp;
}
# otherwise check similarity, and report if pages are (quite) the same
if ($erghttps->{'retval'} == 0)
{
# https worked, now try to find out if pages match
my $similarity= similarity $erghttp->{'body'}, $erghttps->{'body'};
if ($similarity > $similarity_th)
{
$res->{'retval'}=2;
$res->{'response'}="The web page at $url works, but is also available via $secure_url, please consider switching to HTTPS urls.";
return $res;
}
} else
{
# report nothing
$res->{'retval'}=0;
return $res;
}
$res->{'retval'}=0;
$res->{'response'}="lolz";
$res->{'url'}=$url;
return $res;
}
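# Note on the threshold used above: similarity() from String::Similarity
# returns a value between 0 (completely different) and 1 (identical), so two
# page bodies scoring above 0.9 are treated as the same page served over both
# schemes.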
sub darcs()
{
my ($url)=@_;
my $darcsurltemp=$url;
$darcsurltemp =~ s/\/$//;
# A reachable darcs repository serves its hashed inventory file over HTTP.
$darcsurltemp.='/_darcs/hashed_inventory';
return __run_browser($darcsurltemp);
}
sub uploaders()
{
my ($line_uploaders)=@_;
$line_uploaders =~ s/\n/ /g;
my @emails;
if ($line_uploaders =~ /@/)
{
@emails=Mail::Address->parse($line_uploaders);
}
my $res;
foreach my $email(@emails)
{
my $es=$email->address();
my $ra=check_domain($es);
my $r=@$ra[0];
if ($r->{retval}>0)
{
if (!$res->{retval})
{
$res=$r;
} else
{
# Accumulate failures across all uploader addresses into one result.
$res->{retval}=$r->{retval};
$res->{response}.="\n".$r->{response};
$res->{url}=$line_uploaders;
}
}
}
if (!$res->{retval})
{
$res->{'retval'}=0;
$res->{'response'}="";
$res->{'url'}=$line_uploaders;
}
return $res;
}
sub maintainer()
{
my ($email)=@_;
return check_domain($email);
}
sub email()
{
my ($email) =@_;
return check_domain($email);
}
sub __run_browser {
my $certainty;
# SSL/TLS versions to try, in order; on SSL connect/receive errors the loop
# below falls back to the next one.
my @SSLs=(CURL_SSLVERSION_DEFAULT,
CURL_SSLVERSION_TLSv1,
CURL_SSLVERSION_SSLv2,
CURL_SSLVERSION_SSLv3,
CURL_SSLVERSION_TLSv1_0,
CURL_SSLVERSION_TLSv1_1,
CURL_SSLVERSION_TLSv1_2);
my ($url,$return_ref)=@_;
my $user_agent=$default_user_agent;
if (! ( $cli_options->{'no-https'}))
{
$cli_options->{'no-https'}=0;
}
if (! ( $cli_options->{'no-check-certificate'}))
{
$cli_options->{'no-check-certificate'}=0;
}
#check if URL is mailto: link
if ($url =~/mailto:\s*.+@.+/)
{
return check_domain($url);
}
my $curl = WWW::Curl::Easy->new;
my @website_moved_whitelist=('anonscm.debian.org.*duck.git');
$curl->setopt(CURLOPT_HEADER,0);
if ($cli_options->{'no-check-certificate'} == 1) {
$curl->setopt(CURLOPT_SSL_VERIFYPEER,0);
$curl->setopt(CURLOPT_SSL_VERIFYHOST,0);
}
$curl->setopt(CURLOPT_CERTINFO,0);
$curl->setopt(CURLOPT_FOLLOWLOCATION,1);
$curl->setopt(CURLOPT_SSL_CIPHER_LIST,'ALL');
$curl->setopt(CURLOPT_MAXREDIRS,10);
$curl->setopt(CURLOPT_TIMEOUT,60);
$curl->setopt(CURLOPT_USERAGENT,$user_agent);
$curl->setopt(CURLOPT_URL, $url);
#validate against ca-certificates
# $curl->setopt(CURLOPT_SSL_VERIFYPEER,1);
# $curl->setopt(CURLOPT_SSL_VERIFYHOST,1);
# $curl->setopt(CURLOPT_CAPATH,'');
my $response_body;
my $response_code;
my $retcode;
my $response;
my $startscheme;
my $finalscheme;
my $startdomain;
my $finaldomain;
my $startdomainsuffix;
my $finaldomainsuffix;
if ($url =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'https?'})/ )
{
# With {-keep}, $3 holds the scheme and $4 the host of the matched URI.
$startdomain=$4;
$startscheme=$3;
}
foreach my $s (@SSLs)
{
$curl->setopt(CURLOPT_WRITEDATA,\$response_body);
$curl->setopt(CURLOPT_SSLVERSION,$s);
# Starts the actual request
$retcode = $curl->perform;
$response_code = $curl->getinfo(CURLINFO_HTTP_CODE);
$response=$curl->strerror($retcode)." ".$curl->errbuf."\n";
if ($curl->getinfo(CURLINFO_EFFECTIVE_URL) =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'https?'})/ )
{
$finaldomain=$4;
$finalscheme=$3;
}
$startdomainsuffix = Domain::PublicSuffix->new({'data_file' => '/usr/share/publicsuffix/effective_tld_names.dat'});
$finaldomainsuffix = Domain::PublicSuffix->new({'data_file' => '/usr/share/publicsuffix/effective_tld_names.dat'});
# CURLE_SSL_CONNECT_ERROR (35) and CURLE_RECV_ERROR (56): retry with the
# next SSL version in the list.
if ($retcode == 35) { next;}
if ($retcode == 56) {next;}
last;
}
# Looking at the results...
my $status=0;
my $disp=0;
my $is_obsolete=0;
if ($retcode == 0) # no curl error, but maybe a http error
{
#default to error
$status=1;
$disp=1;
#handle ok cases: 200 is ok for sure, 226 and 227 are also accepted
if ($response_code ==200 || $response_code ==226 || $response_code ==227)
{
$status=0;
$disp=0;
}
if ($response_code ==302 ) #temporary redirect is ok
{
$status=0;
$disp=0;
}
if ($response_code ==403)
{
## Special case for sourceforge.net sites:
## sourceforge seems to always return correct pages with HTTP code 403.
if ( $url =~ m/(sourceforge|sf)\.net/i)
{
$status=0;
$disp=0;
}
}
my $whitelisted=0;
foreach my $whitelist_url (@website_moved_whitelist)
{
if ( $url =~ m/$whitelist_url/i)
{$whitelisted=1;}
}
foreach my $obsolete_site (@$obsolete_sites)
{
if ($url =~ m/$obsolete_site/i)
{
$is_obsolete=1;
}
}
if ($whitelisted == 0 && $response_body)
{
$response_body=strip_html_comments($response_body);
foreach my $regex (@$website_moved_regexs)
{
if ($response_body =~ m/\b$regex\b/i )
{
$disp=2;
$response.="Website seems to be outdated, is probably a parked domain or for sale. Please update your links!\nMatching regular expression: m/\\b".$regex."\\b/i";
$certainty="wild-guess";
last;
}
}
}
}
else { # we have a curl error, so we show this entry for sure
$status=1;
$disp=1;
}
my $ret;
$ret->{'retval'}=$disp;
$ret->{'response'}="Curl:$retcode HTTP:$response_code $response";
$ret->{'url'}=$url;
$ret->{'body'}=$response_body;
$ret->{'certainty'}=$certainty;
$ret->{'startscheme'}=$startscheme;
$ret->{'startdomain'}=$startdomain;
$ret->{'finalscheme'}=$finalscheme;
$ret->{'finaldomain'}=$finaldomain;
my @reta;
push(@reta,$ret);
if ($startscheme && $finalscheme)
{
my $rd_startdomainsuffix=$startdomainsuffix->get_root_domain($startdomain);
my $rd_finaldomainsuffix=$finaldomainsuffix->get_root_domain($finaldomain);
if ($rd_startdomainsuffix && $rd_finaldomainsuffix && ($rd_startdomainsuffix ne $rd_finaldomainsuffix))
{
my $ret_dom;
$ret_dom->{'retval'}=2;
$ret_dom->{'response'}="Domain redirect detected: ${startscheme}://".$startdomain." -> ${finalscheme}://".$finaldomain.". Probably a new upstream website?";
$ret_dom->{'url'}=$url;
$ret_dom->{'certainty'}=$certainty;
$ret_dom->{'startscheme'}=$startscheme;
$ret_dom->{'startdomain'}=$startdomain;
$ret_dom->{'finalscheme'}=$finalscheme;
$ret_dom->{'finaldomain'}=$finaldomain;
push (@reta,$ret_dom);
} else
{
if ($is_obsolete==1)
{
my $ret_obsolete;
$ret_obsolete->{'retval'}=2;
$ret_obsolete->{'response'}="The website/URL is known to be obsolete. Please update your links.";
$ret_obsolete->{'url'}=$url;
$ret_obsolete->{'certainty'}="wild-guess";
push (@reta,$ret_obsolete);
}
if ($startscheme eq "https" and $finalscheme eq "http")
{
my $ret_schema;
$ret_schema->{'retval'}=2;
$ret_schema->{'response'}="Secure URL redirects to an insecure URL: ${startscheme}://$startdomain -> ${finalscheme}://${finaldomain}";
$ret_schema->{'url'}=$url;
# $ret->{'body'}=$;
$ret_schema->{'certainty'}=$certainty;
$ret_schema->{'startscheme'}=$startscheme;
$ret_schema->{'startdomain'}=$startdomain;
$ret_schema->{'finalscheme'}=$finalscheme;
$ret_schema->{'finaldomain'}=$finaldomain;
push (@reta,$ret_schema);
}
if ($startscheme eq "http" and $finalscheme eq "https")
{
my $ret_schema;
$ret_schema->{'retval'}=2;
$ret_schema->{'response'}="URL schema changed from HTTP to HTTPS during redirect(s): ${startscheme}://$startdomain -> ${finalscheme}://${finaldomain}\nPlease investigate and update the URL eventually, to avoid unnecessary redirects!";
$ret_schema->{'url'}=$url;
$ret_schema->{'certainty'}=$certainty;
$ret_schema->{'startscheme'}=$startscheme;
$ret_schema->{'startdomain'}=$startdomain;
$ret_schema->{'finalscheme'}=$finalscheme;
$ret_schema->{'finaldomain'}=$finaldomain;
push (@reta,$ret_schema);
}
}
}
if ( ($ret->{'retval'} ==0) && defined($startscheme) && ($startscheme ne "https"))
{
if ( ($cli_options->{'no-https'}==0) && (!($url =~ m/https:\/\//i )) )
{
my $https=try_https_new($url,$ret);
if ($https->{'retval'} > 0 )
{
push(@reta,$https);
}
}
}
return \@reta;
}
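# Shape of the result list returned above (all values illustrative):
#   [ { retval => 0, response => 'Curl:0 HTTP:200 ...', url => '...',
#       body => '...', certainty => undef,
#       startscheme => 'http',  startdomain => 'example.org',
#       finalscheme => 'https', finaldomain => 'example.org' },
#     # plus optional extra entries for domain redirects, obsolete sites,
#     # scheme changes and HTTPS suggestions
#   ]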
sub find_config_file(;$)
{
(my $conffilename)=@_;
if (!defined $conffilename)
{
$conffilename="duck/duck.conf";
}
my @config_dirs;
if (!$ENV{'XDG_CONFIG_DIRS'})
{
push(@config_dirs,"/etc/xdg");
} else
{
push(@config_dirs,split(/:/,$ENV{'XDG_CONFIG_DIRS'}));
}
push (@config_dirs,"/etc");
if (!$ENV{'XDG_CONFIG_HOME'})
{
push(@config_dirs,$ENV{'HOME'}."/.config");
} else
{
push(@config_dirs,$ENV{'XDG_CONFIG_HOME'});
}
foreach my $cdir (reverse @config_dirs)
{
my $fp=$cdir.'/'.$conffilename;
if ( -r $fp) {return $fp;}
}
return 0;
}
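# Effective search order (assuming no XDG_* overrides are set); the first
# readable file wins:
#   $HOME/.config/duck/duck.conf
#   /etc/duck/duck.conf
#   /etc/xdg/duck/duck.conf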
sub __run_helper {
my ($tool,$url)=@_;
return undef unless $helpers->{$tool} == 1;
return undef unless defined $tools->{$tool};
my @args=@{$tools->{$tool}->{'args'}};
for(@args){s/\%URL\%/$url/g}
my $pid;
my $command;
my $timeout;
my @reta;
if ($cli_options->{'timeout'})
{
my $timeout_value=60;
if ( ( $cli_options->{'timeout_seconds'} ))
{
$timeout_value=$cli_options->{'timeout_seconds'};
# Keep digits only, so the value is safe to pass to timeout(1).
$timeout_value =~ s/[^0-9]//g;
}
unshift @args,$tools->{$tool}->{'cmd'};
unshift @args,$timeout_value."s";
$command="/usr/bin/timeout";
$pid=open3(\*WRITE,\*READ,0,$command,@args);
}
else
{
$pid=open3(\*WRITE,\*READ,0,$tools->{$tool}->{'cmd'},@args);
}
my @results = <READ>;
waitpid ($pid,0);
close READ;
my $retval=$?;
my $ret;
$ret->{'retval'}=$retval;
$ret->{'response'}=join("",@results);
$ret->{'url'}=$url;
push(@reta,$ret);
return \@reta;
}
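# Example command line built above for the 'svn' tool when a timeout is set
# (the URL is hypothetical):
#   /usr/bin/timeout 60s svn --non-interactive --trust-server-cert info https://example.org/svn/trunk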
sub check_domain($)
{
my $res = Net::DNS::Resolver->new;
my ($email) = @_;
my @emails=Mail::Address->parse($email);
$email=$emails[0]->address();
# Extract the domain part after '@', stopping at characters that cannot
# belong to it.
my @domain = ( $email =~ m/^[^@]*@([^?&>]*).*/);
my @queries=('MX','A','AAAA');
my @results;
my $iserror=1;
foreach my $query (@queries)
{
my $q=$res->query($domain[0],$query);
if ($q)
{
my @answers=$q->answer;
my $answer_count=scalar @answers;
push (@results,$answer_count." ".$query." entries found.");
$iserror=0;
last;
} else
{
push (@results,"$email: No ".$query." entry found.");
}
}
my $ret;
my @reta;
$ret->{'retval'}=$iserror;
$ret->{'response'}=join("\n",@results);
$ret->{'url'}=$email;
push(@reta,$ret);
return \@reta;
}
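# Usage sketch (the address is hypothetical): the first record type that
# resolves (MX, then A, then AAAA) makes the check succeed.
#   my $results = DUCK::check_domain('someone@example.org');
#   # $results->[0]{retval} is 0 on success, 1 if no record was found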
sub strip_html_comments()
{
my ($html)=@_;
# Render the page through lynx, which strips all markup (including HTML
# comments). Note: open3() raises an exception on failure instead of
# returning false.
my $pid=open3(\*WRITE,\*READ,\*ERR,'lynx -cfg=/dev/null -dump -width=2048 -stdin');
print WRITE $html;
close WRITE;
my @cleaned=<READ>;
waitpid($pid,0);
close(READ);
return join("",@cleaned);
}
sub extract_url($$) {
my ($b,$l)=@_;
if ( $l =~ /\(\s*($RE{URI}{-keep}{-scheme =>'https?'})\s*\)/ ||
$l =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'https?'})/ ||
$l =~ /\(\s*($RE{URI}{HTTP}{-keep}{-scheme =>'ftp'})\s*\)/ ||
$l =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'ftp'})/
)
{
#ok, we have a url here, now clean it up:
my $url=$1;
if ($urlfixes->{TRAILING_PAREN_DOT}) {
$url =~ s/\)\.\s*$//g;
}
if ($urlfixes->{TRAILING_SLASH_DOT}) {
$url =~ s/\/\.\s*$/\//g;
}
if ($urlfixes->{TRAILING_SLASH_PAREN}) {
$url =~ s/\/\)\s*$/\//g;
}
if ($urlfixes->{TRAILING_COLON}) {
$url =~ s/:\s*$//g;
}
if ($urlfixes->{TRAILING_PUNCTUATION}) {
$url =~ s/([^\/])[\.,]\s*$/$1/g;
}
if ($urlfixes->{TRAILING_QUOTES}) {
$url =~ s/'\s*$//g;
}
return $url;
}
# No URL found in this line.
return;
}
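# Usage sketch (text and first argument are hypothetical; extract_url takes
# two arguments and ignores the first in its matching logic):
#   my $url = DUCK::extract_url($block, 'See our homepage (http://example.org/).');
#   # yields 'http://example.org/'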
1;