File: /usr/share/duck/DUCK.pm
# Copyright (C) 2016 Simon Kainz <skainz@debian.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# On Debian GNU/Linux systems, the complete text of the GNU General
# Public License can be found in `/usr/share/common-licenses/GPL-2'.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, you can find it on the World Wide
# Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free
# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301, USA.
use strict;
use warnings;
package DUCK;
my $VERSION ='0.13.1';
my $COPYRIGHT_YEAR ='2017';
my $default_user_agent='Mozilla/5.0 (X11; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0';
use Regexp::Common qw /URI Email::Address/;
use String::Similarity;
use File::Which;
use WWW::Curl::Easy;
use IPC::Open3;
use IO::Select;
use Net::DNS;
use Mail::Address;
use Domain::PublicSuffix;
use Config::Simple '-strict';
my $callbacks;
my $urlfixes={
TRAILING_PAREN_DOT=>1,
TRAILING_SLASH_DOT=>1,
TRAILING_SLASH_PAREN=>1,
TRAILING_COLON=>1,
TRAILING_PUNCTUATION=>1,
TRAILING_QUOTES=>1
};
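# What each substitution in extract_url() below does to a captured URL
# (the example.org inputs are hypothetical):
#   TRAILING_PAREN_DOT:   "http://example.org/foo)."  -> "http://example.org/foo"
#   TRAILING_SLASH_DOT:   "http://example.org/."      -> "http://example.org/"
#   TRAILING_SLASH_PAREN: "http://example.org/)"      -> "http://example.org/"
#   TRAILING_COLON:       "http://example.org/foo:"   -> "http://example.org/foo"
#   TRAILING_PUNCTUATION: "http://example.org/foo,"   -> "http://example.org/foo"
#   TRAILING_QUOTES:      "http://example.org/foo'"   -> "http://example.org/foo"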
my $self;
my $helpers={
svn =>0,
bzr =>0,
git =>0,
darcs =>1, # Always available, as the check uses WWW::Curl::Easy
hg => 0,
browser =>1 # Always available, as the check uses WWW::Curl::Easy
};
my $website_moved_regexs;
my $obsolete_sites;
my $cli_options;
my $tools=
{
git => {
cmd => 'git',
args => ['ls-remote','%URL%']
},
hg =>{
cmd => 'hg',
args => ['id','%URL%']
},
bzr => {
cmd => 'bzr',
args => ['-Ossl.cert_reqs=none','log','%URL%']
},
svn => {
cmd => 'svn',
args => ['--non-interactive','--trust-server-cert','info','%URL%']
}
};
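# Sketch of how an entry above is used (the URL is hypothetical):
# __run_helper() substitutes %URL% in the argument template, so checking a
# Git repository effectively runs
#   git ls-remote https://example.org/some/repo.git
# and an exit status of 0 means the repository is reachable.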
sub version
{
return $VERSION;
}
sub copyright_year
{
return $COPYRIGHT_YEAR;
}
sub new {
my $class = shift;
$self = {};
bless $self, $class;
$self->__find_helpers();
foreach (keys %$tools)
{
$tools->{$_}->{'args_count'}=scalar @{$tools->{$_}->{'args'}};
}
my $config_file=find_config_file();
if($config_file)
{
my $nc=Config::Simple->new($config_file);
my $hashes=$nc->get_block("parked_domains");
$website_moved_regexs=$hashes->{'regexes'};
$hashes=$nc->get_block("obsolete_sites");
if ($hashes)
{ if ($hashes->{'regexes'})
{
$obsolete_sites=$hashes->{'regexes'};
}
}
}
return $self;
}
sub cb()
{
$callbacks=
{
"Vcs-Browser" =>\&browser,
"Vcs-Darcs" =>\&darcs,
"Vcs-Git" =>\&git,
"Vcs-Hg" =>\&hg,
"Vcs-Svn" =>\&svn,
"Vcs-Bzr" =>\&bzr,
"Homepage" => \&browser,
"URL" => \&browser,
"Email" => \&email,
"Maintainer" => \&maintainer,
"Uploaders" => \&uploaders,
"Try-HTTPS" => \&try_https,
"SVN" => \&svn
};
return $callbacks;
}
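# Minimal usage sketch (field name and URL are illustrative): look up and
# invoke the checker for a control field.
#   my $cb = DUCK::cb();
#   my $result = $cb->{'Vcs-Git'}->('https://example.org/repo.git')
#       if exists $cb->{'Vcs-Git'};
# Most callbacks return an arrayref of hashrefs carrying at least 'retval',
# 'response' and 'url'; try_https and uploaders return a single such hashref.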
sub setOptions()
{
shift;
my ($ke,$va)=@_;
$cli_options->{$ke}=$va;
}
sub getUrlFixes()
{
return keys %{$urlfixes};
}
sub setUrlFixOptions()
{
shift;
my ($ke,$va)=@_;
if (!exists $urlfixes->{$ke}) {
print "Unknown urlfix parameter: $ke\nAvailable options are:\n\t".join("\n\t",getUrlFixes())."\n";
exit 1;
}
$urlfixes->{$ke}=$va;
}
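# Usage sketch (key and value are illustrative): disable one of the URL
# cleanups listed by getUrlFixes().
#   $duck->setUrlFixOptions('TRAILING_COLON', 0);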
sub __find_helpers()
{
$helpers->{git}=1 if defined which('git');
$helpers->{svn}=1 if defined which('svn');
$helpers->{hg}=1 if defined which('hg');
$helpers->{bzr}=1 if defined which('bzr');
}
sub getHelpers()
{ return $helpers; }
sub git()
{
my ($url)=@_;
my @urlparts=split(/\s+/,$url);
if ($tools->{'git'}->{'args_count'})
{
# Reset the argument list to its original length, dropping any
# "-b <branch>" left over from a previous call.
splice(@{$tools->{'git'}->{'args'}},$tools->{'git'}->{'args_count'});
}
if ($urlparts[1])
{
if ($urlparts[1] eq "-b" && $urlparts[2])
{
push(@{$tools->{'git'}->{'args'}},'-b '.$urlparts[2]);
}
}
return __run_helper('git',$urlparts[0]);
}
sub bzr()
{
my ($url)=@_;
return __run_helper('bzr',$url);
}
sub hg()
{
my ($url)=@_;
return __run_helper('hg',$url);
}
sub svn()
{
my ($url)=@_;
$ENV{SVN_SSH}='ssh -o BatchMode=yes';
return __run_helper('svn',$url);
}
sub browser()
{
my ($url)=@_;
# Strip trailing dots before handing the URL to curl.
$url =~ s/\.*$//g;
return __run_browser($url);
}
sub try_https_new($;$)
{
my $similarity_th=0.9;
my ($url,$erghttp)=@_;
# Strip trailing dots before building the https variant of the URL.
$url =~ s/\.*$//g;
my $res;
my $secure_url= $url;
$secure_url=~ s/http:/https:/g;
my $ergshttps= __run_browser($secure_url);
my $erghttps=$ergshttps->[0];
if ($erghttps->{'retval'} >0 )
{
# error with https, so do not suggest switching to https, report only http check results
return $erghttp;
}
# otherwise check similarity, and report if pages are (quite) the same
if ($erghttps && $erghttps->{'retval'} == 0 && $erghttp && $erghttp->{'body'} && $erghttp->{'finalscheme'} eq "http")
{
# https worked, now try to find out if the http and https pages match
my $similarity= similarity $erghttp->{'body'}, $erghttps->{'body'};
if ($similarity > $similarity_th)
{
$res->{'retval'}=2;
$res->{'response'}="The web page at $url works, but is also available via $secure_url, please consider switching to HTTPS URLs.";
return $res;
}
} else
{
# report nothing
$res->{'retval'}=0;
return $res;
}
$res->{'retval'}=0;
$res->{'response'}="";
$res->{'url'}=$url;
return $res;
}
sub try_https()
{
my $similarity_th=0.9;
my ($url)=@_;
$url =~ s/\.*$//g;
my $res;
# __run_browser() returns an arrayref of result hashrefs; the first entry
# holds the plain HTTP result.
my $ergs= __run_browser($url);
my $erghttp=$ergs->[0];
if ($erghttp->{'retval'} >0 ) {return $erghttp;}
my $secure_url= $url;
$secure_url=~ s/http:/https:/g;
my $ergshttps= __run_browser($secure_url);
my $erghttps=$ergshttps->[0];
if ($erghttps->{'retval'} >0 )
{
# error with https, so do not suggest switching to https, report only http check results
return $erghttp;
}
# otherwise check similarity, and report if pages are (quite) the same
if ($erghttps->{'retval'} == 0)
{
# https worked, now try to find out if pages match
my $similarity= similarity $erghttp->{'body'}, $erghttps->{'body'};
if ($similarity > $similarity_th)
{
$res->{'retval'}=2;
$res->{'response'}="The web page at $url works, but is also available via $secure_url, please consider switching to HTTPS urls.";
return $res;
}
} else
{
# report nothing
$res->{'retval'}=0;
return $res;
}
$res->{'retval'}=0;
$res->{'response'}="lolz";
$res->{'url'}=$url;
return $res;
}
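# Note on the threshold used above: similarity() from String::Similarity
# returns a value between 0 (completely different) and 1 (identical), so two
# page bodies scoring above 0.9 are treated as the same page served over both
# schemes.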
sub darcs()
{
my ($url)=@_;
my $darcsurltemp=$url;
$darcsurltemp =~ s/\/$//;
# A reachable darcs repository serves its hashed inventory file over HTTP.
$darcsurltemp.='/_darcs/hashed_inventory';
return __run_browser($darcsurltemp);
}
sub uploaders()
{
my ($line_uploaders)=@_;
$line_uploaders =~ s/\n/ /g;
my @emails;
if ($line_uploaders =~ /@/)
{
@emails=Mail::Address->parse($line_uploaders);
}
my $res;
foreach my $email(@emails)
{
my $es=$email->address();
my $ra=check_domain($es);
my $r=@$ra[0];
if ($r->{retval}>0)
{
if (!$res->{retval})
{
$res=$r;
} else
{
# Accumulate failures across all uploader addresses into one result.
$res->{retval}=$r->{retval};
$res->{response}.="\n".$r->{response};
$res->{url}=$line_uploaders;
}
}
}
if (!$res->{retval})
{
$res->{'retval'}=0;
$res->{'response'}="";
$res->{'url'}=$line_uploaders;
}
return $res;
}
sub maintainer()
{
my ($email)=@_;
return check_domain($email);
}
sub email()
{
my ($email) =@_;
return check_domain($email);
}
sub __run_browser {
my $certainty;
# SSL/TLS versions to try, in order; on SSL connect/receive errors the loop
# below falls back to the next one.
my @SSLs=(CURL_SSLVERSION_DEFAULT,
CURL_SSLVERSION_TLSv1,
CURL_SSLVERSION_SSLv2,
CURL_SSLVERSION_SSLv3,
CURL_SSLVERSION_TLSv1_0,
CURL_SSLVERSION_TLSv1_1,
CURL_SSLVERSION_TLSv1_2);
my ($url,$return_ref)=@_;
my $user_agent=$default_user_agent;
if (! ( $cli_options->{'no-https'}))
{
$cli_options->{'no-https'}=0;
}
if (! ( $cli_options->{'no-check-certificate'}))
{
$cli_options->{'no-check-certificate'}=0;
}
#check if URL is mailto: link
if ($url =~/mailto:\s*.+@.+/)
{
return check_domain($url);
}
my $curl = WWW::Curl::Easy->new;
my @website_moved_whitelist=('anonscm.debian.org.*duck.git');
$curl->setopt(CURLOPT_HEADER,0);
if ($cli_options->{'no-check-certificate'} == 1) {
$curl->setopt(CURLOPT_SSL_VERIFYPEER,0);
$curl->setopt(CURLOPT_SSL_VERIFYHOST,0);
}
$curl->setopt(CURLOPT_CERTINFO,0);
$curl->setopt(CURLOPT_FOLLOWLOCATION,1);
$curl->setopt(CURLOPT_SSL_CIPHER_LIST,'ALL');
$curl->setopt(CURLOPT_MAXREDIRS,10);
$curl->setopt(CURLOPT_TIMEOUT,60);
$curl->setopt(CURLOPT_USERAGENT,$user_agent);
$curl->setopt(CURLOPT_URL, $url);
#validate against ca-certificates
# $curl->setopt(CURLOPT_SSL_VERIFYPEER,1);
# $curl->setopt(CURLOPT_SSL_VERIFYHOST,1);
# $curl->setopt(CURLOPT_CAPATH,'');
my $response_body;
my $response_code;
my $retcode;
my $response;
my $startscheme;
my $finalscheme;
my $startdomain;
my $finaldomain;
my $startdomainsuffix;
my $finaldomainsuffix;
if ($url =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'https?'})/ )
{
# With {-keep}, $3 holds the scheme and $4 the host of the matched URI.
$startdomain=$4;
$startscheme=$3;
}
foreach my $s (@SSLs)
{
$curl->setopt(CURLOPT_WRITEDATA,\$response_body);
$curl->setopt(CURLOPT_SSLVERSION,$s);
# Starts the actual request
$retcode = $curl->perform;
$response_code = $curl->getinfo(CURLINFO_HTTP_CODE);
$response=$curl->strerror($retcode)." ".$curl->errbuf."\n";
if ($curl->getinfo(CURLINFO_EFFECTIVE_URL) =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'https?'})/ )
{
$finaldomain=$4;
$finalscheme=$3;
}
$startdomainsuffix = Domain::PublicSuffix->new({'data_file' => '/usr/share/publicsuffix/effective_tld_names.dat'});
$finaldomainsuffix = Domain::PublicSuffix->new({'data_file' => '/usr/share/publicsuffix/effective_tld_names.dat'});
# CURLE_SSL_CONNECT_ERROR (35) and CURLE_RECV_ERROR (56): retry with the
# next SSL version in the list.
if ($retcode == 35) { next;}
if ($retcode == 56) {next;}
last;
}
# Looking at the results...
my $status=0;
my $disp=0;
my $is_obsolete=0;
if ($retcode == 0) # no curl error, but maybe a http error
{
#default to error
$status=1;
$disp=1;
#handle ok cases: 200 is ok for sure, 226 and 227 are also accepted
if ($response_code ==200 || $response_code ==226 || $response_code ==227)
{
$status=0;
$disp=0;
}
if ($response_code ==302 ) #temporary redirect is ok
{
$status=0;
$disp=0;
}
if ($response_code ==403)
{
## Special case for sourceforge.net sites:
## sourceforge seems to always return correct pages with HTTP code 403.
if ( $url =~ m/(sourceforge|sf)\.net/i)
{
$status=0;
$disp=0;
}
}
my $whitelisted=0;
foreach my $whitelist_url (@website_moved_whitelist)
{
if ( $url =~ m/$whitelist_url/i)
{$whitelisted=1;}
}
foreach my $obsolete_site (@$obsolete_sites)
{
if ($url =~ m/$obsolete_site/i)
{
$is_obsolete=1;
}
}
if ($whitelisted == 0 && $response_body)
{
$response_body=strip_html_comments($response_body);
foreach my $regex (@$website_moved_regexs)
{
if ($response_body =~ m/\b$regex\b/i )
{
$disp=2;
$response.="Website seems to be outdated, is probably a parked domain or for sale. Please update your links!\nMatching regular expression: m/\\b".$regex."\\b/i";
$certainty="wild-guess";
last;
}
}
}
}
else { # we have a curl error, so we show this entry for sure
$status=1;
$disp=1;
}
my $ret;
$ret->{'retval'}=$disp;
$ret->{'response'}="Curl:$retcode HTTP:$response_code $response";
$ret->{'url'}=$url;
$ret->{'body'}=$response_body;
$ret->{'certainty'}=$certainty;
$ret->{'startscheme'}=$startscheme;
$ret->{'startdomain'}=$startdomain;
$ret->{'finalscheme'}=$finalscheme;
$ret->{'finaldomain'}=$finaldomain;
my @reta;
push(@reta,$ret);
if ($startscheme && $finalscheme)
{
my $rd_startdomainsuffix=$startdomainsuffix->get_root_domain($startdomain);
my $rd_finaldomainsuffix=$finaldomainsuffix->get_root_domain($finaldomain);
if ($rd_startdomainsuffix && $rd_finaldomainsuffix && ($rd_startdomainsuffix ne $rd_finaldomainsuffix))
{
my $ret_dom;
$ret_dom->{'retval'}=2;
$ret_dom->{'response'}="Domain redirect detected: ${startscheme}://".$startdomain." -> ${finalscheme}://".$finaldomain.". Probably a new upstream website?";
$ret_dom->{'url'}=$url;
$ret_dom->{'certainty'}=$certainty;
$ret_dom->{'startscheme'}=$startscheme;
$ret_dom->{'startdomain'}=$startdomain;
$ret_dom->{'finalscheme'}=$finalscheme;
$ret_dom->{'finaldomain'}=$finaldomain;
push (@reta,$ret_dom);
} else
{
if ($is_obsolete==1)
{
my $ret_obsolete;
$ret_obsolete->{'retval'}=2;
$ret_obsolete->{'response'}="The website/URL is known to be obsolete. Please update your links.";
$ret_obsolete->{'url'}=$url;
$ret_obsolete->{'certainty'}="wild-guess";
push (@reta,$ret_obsolete);
}
if ($startscheme eq "https" and $finalscheme eq "http")
{
my $ret_schema;
$ret_schema->{'retval'}=2;
$ret_schema->{'response'}="Secure URL redirects to an insecure URL: ${startscheme}://$startdomain -> ${finalscheme}://${finaldomain}";
$ret_schema->{'url'}=$url;
# $ret->{'body'}=$;
$ret_schema->{'certainty'}=$certainty;
$ret_schema->{'startscheme'}=$startscheme;
$ret_schema->{'startdomain'}=$startdomain;
$ret_schema->{'finalscheme'}=$finalscheme;
$ret_schema->{'finaldomain'}=$finaldomain;
push (@reta,$ret_schema);
}
if ($startscheme eq "http" and $finalscheme eq "https")
{
my $ret_schema;
$ret_schema->{'retval'}=2;
$ret_schema->{'response'}="URL schema changed from HTTP to HTTPS during redirect(s): ${startscheme}://$startdomain -> ${finalscheme}://${finaldomain}\nPlease investigate and update the URL eventually, to avoid unnecessary redirects!";
$ret_schema->{'url'}=$url;
$ret_schema->{'certainty'}=$certainty;
$ret_schema->{'startscheme'}=$startscheme;
$ret_schema->{'startdomain'}=$startdomain;
$ret_schema->{'finalscheme'}=$finalscheme;
$ret_schema->{'finaldomain'}=$finaldomain;
push (@reta,$ret_schema);
}
}
}
if ( ($ret->{'retval'} ==0) && defined($startscheme) && ($startscheme ne "https"))
{
if ( ($cli_options->{'no-https'}==0) && (!($url =~ m/https:\/\//i )) )
{
my $https=try_https_new($url,$ret);
if ($https->{'retval'} > 0 )
{
push(@reta,$https);
}
}
}
return \@reta;
}
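# Shape of the result list returned above (all values illustrative):
#   [ { retval => 0, response => 'Curl:0 HTTP:200 ...', url => '...',
#       body => '...', certainty => undef,
#       startscheme => 'http',  startdomain => 'example.org',
#       finalscheme => 'https', finaldomain => 'example.org' },
#     # plus optional extra entries for domain redirects, obsolete sites,
#     # scheme changes and HTTPS suggestions
#   ]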
sub find_config_file(;$)
{
(my $conffilename)=@_;
if (!defined $conffilename)
{
$conffilename="duck/duck.conf";
}
my @config_dirs;
if (!$ENV{'XDG_CONFIG_DIRS'})
{
push(@config_dirs,"/etc/xdg");
} else
{
push(@config_dirs,split(/:/,$ENV{'XDG_CONFIG_DIRS'}));
}
push (@config_dirs,"/etc");
if (!$ENV{'XDG_CONFIG_HOME'})
{
push(@config_dirs,$ENV{'HOME'}."/.config");
} else
{
push(@config_dirs,$ENV{'XDG_CONFIG_HOME'});
}
foreach my $cdir (reverse @config_dirs)
{
my $fp=$cdir.'/'.$conffilename;
if ( -r $fp) {return $fp;}
}
return 0;
}
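# Effective search order (assuming no XDG_* overrides are set); the first
# readable file wins:
#   $HOME/.config/duck/duck.conf
#   /etc/duck/duck.conf
#   /etc/xdg/duck/duck.conf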
sub __run_helper {
my ($tool,$url)=@_;
return undef unless $helpers->{$tool} == 1;
return undef unless defined $tools->{$tool};
my @args=@{$tools->{$tool}->{'args'}};
for(@args){s/\%URL\%/$url/g}
my $pid;
my $command;
my $timeout;
my @reta;
if ($cli_options->{'timeout'})
{
my $timeout_value=60;
if ( ( $cli_options->{'timeout_seconds'} ))
{
$timeout_value=$cli_options->{'timeout_seconds'};
# Keep digits only, so the value is safe to pass to timeout(1).
$timeout_value =~ s/[^0-9]//g;
}
unshift @args,$tools->{$tool}->{'cmd'};
unshift @args,$timeout_value."s";
$command="/usr/bin/timeout";
$pid=open3(\*WRITE,\*READ,0,$command,@args);
}
else
{
$pid=open3(\*WRITE,\*READ,0,$tools->{$tool}->{'cmd'},@args);
}
my @results = <READ>;
waitpid ($pid,0);
close READ;
my $retval=$?;
my $ret;
$ret->{'retval'}=$retval;
$ret->{'response'}=join("",@results);
$ret->{'url'}=$url;
push(@reta,$ret);
return \@reta;
}
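# Example command line built above for the 'svn' tool when a timeout is set
# (the URL is hypothetical):
#   /usr/bin/timeout 60s svn --non-interactive --trust-server-cert info https://example.org/svn/trunk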
sub check_domain($)
{
my $res = Net::DNS::Resolver->new;
my ($email) = @_;
my @emails=Mail::Address->parse($email);
$email=$emails[0]->address();
# Extract the domain part after '@', stopping at characters that cannot
# belong to it.
my @domain = ( $email =~ m/^[^@]*@([^?&>]*).*/);
my @queries=('MX','A','AAAA');
my @results;
my $iserror=1;
foreach my $query (@queries)
{
my $q=$res->query($domain[0],$query);
if ($q)
{
my @answers=$q->answer;
my $answer_count=scalar @answers;
push (@results,$answer_count." ".$query." entries found.");
$iserror=0;
last;
} else
{
push (@results,"$email: No ".$query." entry found.");
}
}
my $ret;
my @reta;
$ret->{'retval'}=$iserror;
$ret->{'response'}=join("\n",@results);
$ret->{'url'}=$email;
push(@reta,$ret);
return \@reta;
}
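# Usage sketch (the address is hypothetical): the first record type that
# resolves (MX, then A, then AAAA) makes the check succeed.
#   my $results = DUCK::check_domain('someone@example.org');
#   # $results->[0]{retval} is 0 on success, 1 if no record was found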
sub strip_html_comments()
{
my ($html)=@_;
# Render the page through lynx, which strips all markup (including HTML
# comments). Note: open3() raises an exception on failure instead of
# returning false.
my $pid=open3(\*WRITE,\*READ,\*ERR,'lynx -cfg=/dev/null -dump -width=2048 -stdin');
print WRITE $html;
close WRITE;
my @cleaned=<READ>;
waitpid($pid,0);
close(READ);
return join("",@cleaned);
}
sub extract_url($$) {
my ($b,$l)=@_;
if ( $l =~ /\(\s*($RE{URI}{-keep}{-scheme =>'https?'})\s*\)/ ||
$l =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'https?'})/ ||
$l =~ /\(\s*($RE{URI}{HTTP}{-keep}{-scheme =>'ftp'})\s*\)/ ||
$l =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'ftp'})/
)
{
#ok, we have a url here, now clean it up:
my $url=$1;
if ($urlfixes->{TRAILING_PAREN_DOT}) {
$url =~ s/\)\.\s*$//g;
}
if ($urlfixes->{TRAILING_SLASH_DOT}) {
$url =~ s/\/\.\s*$/\//g;
}
if ($urlfixes->{TRAILING_SLASH_PAREN}) {
$url =~ s/\/\)\s*$/\//g;
}
if ($urlfixes->{TRAILING_COLON}) {
$url =~ s/:\s*$//g;
}
if ($urlfixes->{TRAILING_PUNCTUATION}) {
$url =~ s/([^\/])[\.,]\s*$/$1/g;
}
if ($urlfixes->{TRAILING_QUOTES}) {
$url =~ s/'\s*$//g;
}
return $url;
}
# No URL found in this line.
return;
}
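# Usage sketch (text and first argument are hypothetical; extract_url takes
# two arguments and ignores the first in its matching logic):
#   my $url = DUCK::extract_url($block, 'See our homepage (http://example.org/).');
#   # yields 'http://example.org/'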
1;