File: //usr/share/cvs/contrib/validate_repo
#! /usr/bin/perl -w
########################################################################
#
#  Copyright (c) 2000, 2001 by Donald Sharp <sharpd@cisco.com>
#  All Rights Reserved
#
#  Some portions Copyright (c) 2002, 2003 by
#                Derek R. Price <mailto:derek@ximbiot.com>
#                & Ximbiot <http://ximbiot.com>.
#  All rights reserved.
#
#  Permission is granted to copy and/or distribute this file, with or
#  without modifications, provided this notice is preserved.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2, or (at your option)
#  any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
########################################################################
=head1 validate_repo.pl
Script to check the integrity of the Repository.
=head1 SYNOPSIS
    perldoc validate_repo.pl
    validate_repo.pl --help [--verbose!]
    validate_repo.pl [--verbose!] [--cvsroot=CVSROOT] [--exec=SCRIPT]...
                     [--all-revisions!] [module]...
=head1 DESCRIPTION
This script will search through a repository and determine if any of the
files in it are corrupted.
This is normally accomplished by checking out all I<important> revisions, where
I<important> revisions are defined as the smallest set which, when checked out,
will cause each and every revision's integrity to be verified.  This resolves
to the most recent revision on each branch and the first and last revisions on
the trunk.
Please do not run this script inside of the repository itself.  This will cause
it too fail.
=head1 OPTIONS
=over
=item C<--help>
Print this very help text (or, with C<--verbose>, act like
C<perldoc validate_repo.pl>).
=item C<-a> or C<--all-revisions>
Check out each and every revision rather than just the I<important> ones.
This flag is useful with C<--exec> to execute the C<SCRIPT> (from C<--exec>
below) on a checked out copy of each and every revision.
=item C<-d> or C<--cvsroot=CVSROOT>
Use repository specified by C<CVSROOT>.  Defaults to the contents of the
F<./CVS/Root> file when it exists and is readable, then to the contents of the
C<$CVSROOT> environment variable when it is set and non-empty.
=item C<-e> or C<--exec=SCRIPT>
Execute (as from command prompt) C<SCRIPT> if it exists as a file, is readable,
and is executable, or evaluate (as a perl script) C<SCRIPT> for a checked out
copy of each I<important> revision of each RCS archive in CVSROOT.  Executed
scripts are passed C<CVSROOT FILE REVISION FNO>, where C<CVSROOT> is what
you'd think, C<FILE> is the path to the file relative to C<CVSROOT> and
suitable for use as an argument to C<cvs co>, C<cvs rlog>, and so on,
C<REVISION> is the revision of the checked out file, and C<FNO> is the file
number of the open, read-only file descriptor containing the checked out
contents of revision C<REVISION> of C<FILE>.  An evaluated C<SCRIPT> will find
the same four arguments in the same order in C<@_>, except that C<FNO> will be
an open file handle.
With C<--all-revisions>, execute or evaluate C<SCRIPT> for a checked out
version of each revsion in the RCS archive.
=item C<-v> or C<--verbose>
Print verbose debugging information (or, when specified with C<--help>, act
like C<perldoc validate_repo.pl>).
=head1 ARGUMENTS
=over
=item C<modules>
The module in the repository to examine.  Defaults to the contents of the
F<./CVS/Repository> file when it exists and is readable, then to F<.>
(all modules).
=head1 EXAMPLES
    setenv CVSROOT /release/111/cvs
    validate_repo.pl
    validate_repo.pl -d /another/cvsroot --verbose --exec '
    system "grep \"This string means Im a bad, bad file!\" <&"
           . fileno( $_[3] )
           . ">/dev/null"
        or die "Revision $_[2] of $_[0]/$_[1],v is bad, bad, bad!"'
=head1 SEE ALSO
None.
=cut
######################################################################
#                    MODULES                                         #
######################################################################
use strict;
use Fcntl qw( F_GETFD F_SETFD );
use File::Find;
use File::Basename;
use File::Path;
use File::Spec;
use Getopt::Long;
use IO::File;
use Pod::Usage;
######################################################################
#                    GLOBALS                                         #
######################################################################
use vars qw(
             $all_revisions
             $cvsroot
             @extra_files
             @ignore_files
             $ignored_files
             @invalid_revs
             @list_of_broken_files
             @scripts
             $total_files
             $total_interesting_revisions
             $total_revisions
             $verbose
           );
######################################################################
#                    SUBROUTINES                                     #
######################################################################
######################################################################
#
#    NAME :
#      main
#
#    PURPOSE :
#      To search the repository for broken files
#
#    PARAMETERS :
#      NONE
#
#    GLOBALS :
#      $cvsroot              - The CVS repository to search through.
#      $ENV{ CVSROOT }       - The default CVS repository to search through.
#      @list_of_broken_files - The list of files that need to
#                              be fixed.
#      $verbose              - is verbose mode on?
#      @scripts              - scripts to run on checked out files.
#      $total_revisions      - The number of revisions considered
#      $total_interesting_revisions - The number of revisions used
#      $total_files          - The total number of files looked at.
#
#    RETURNS :
#      A list of broken files
#
#    COMMENTS :
#      Do not run this script inside the repository.  Choose
#      a nice safe spot( like /tmp ) outside of the repository.
#
######################################################################
sub main
{
	my $help;
	$ignored_files = 0;
	$total_files = 0;
	$total_interesting_revisions = 0;
	$total_revisions = 0;
	Getopt::Long::Configure( "bundling" );
	unless( GetOptions(
	                    'all-revisions|a!' => \$all_revisions,
	                    'cvsroot|d=s' => \$cvsroot,
	                    'exec|e=s' => \@scripts,
	                    'help|h|?!' => \$help,
	                    'verbose|v!' => \$verbose
 	                  )
	      )
	{
		pod2usage( 2 );
		exit 2;
	}
	pod2usage( -exitval => 2,
	           -verbose => $verbose ? 2 : 1,
	           -output => \*STDOUT )
		if $help;
	verbose( "Verbose Mode Turned On\n" );
	if( !$cvsroot && -f "CVS/Root" && -r "CVS/Root" )
	{
		my $file = new IO::File "< CVS/Root";
		$cvsroot = $file->getline;
		chomp $cvsroot;
	}
	$cvsroot = $ENV{'CVSROOT'} unless $cvsroot;
	pod2usage( "error: Must set CVSROOT" ) unless $cvsroot;
	if( $cvsroot =~ /^:\w+:/ && $cvsroot !~ /^:local:/
	    || $cvsroot =~ /@/ )
	{
		print STDERR "CVSROOT must be :local:\n";
		exit 2;
	}
	for (@scripts)
	{
		$_ = File::Spec->rel2abs( $_ ) unless /\n/ || !-x $_;
	}
	if( !scalar( @ARGV ) && -f "CVS/Repository" && -r "CVS/Repository" )
	{
		my $file = new IO::File "< CVS/Repository";
		my $module = $file->getline;
		chomp $module;
		push @ARGV, $module;
	}
	push @ARGV, "." unless( scalar @ARGV );
	foreach my $directory_to_look_at ( @ARGV )
	{
		$directory_to_look_at = File::Spec->catfile( $cvsroot,
		                                             $directory_to_look_at );
		my $sym_count = 0;
		while( -l $directory_to_look_at )
		{
			$directory_to_look_at = readlink( $directory_to_look_at );
			$sym_count += 1;
			die( "Encountered too many symlinks for CVSROOT ($cvsroot)\n" )
				if( $sym_count > 5 );
		}
		# Remove indirections.
		$directory_to_look_at =~ s#(/+.)*$##o;
		verbose( "Processing: $directory_to_look_at\n" );
		@ignore_files = get_ignore_files_from_cvsroot( $directory_to_look_at );
		find( \&process_file, $directory_to_look_at );
	}
	print "List of corrupted files\n" if @list_of_broken_files;
	foreach my $broken ( @list_of_broken_files )
	{
		print( "**** File: $broken\n" );
	}
	print "List of Files containing invalid revisions:\n"
		if @invalid_revs;
	foreach ( @invalid_revs )
	{
		print( "**** File: ($_->{'rev'}) $_->{'file'}\n" );
	}
	print "List of Files That Don't belong in Repository:\n"
		if @extra_files;
	foreach my $extra ( @extra_files )
	{
		print( "**** File: $extra\n" );
	}
	print( "Total Files: $total_files  Corrupted files: "
		   . scalar( @list_of_broken_files )
		   . "  Invalid revs: "
		   . scalar( @invalid_revs )
		   . "  Extra files: "
		   . scalar( @extra_files )
		   . "  Ignored Files: $ignored_files\n" );
	print( "Total Revisions: $total_revisions  Interesting Revisions: $total_interesting_revisions\n" );
}
sub verbose
{
	print STDERR @_ if $verbose;
}
######################################################################
#
#    NAME :
#      process_file
#
#    PURPOSE :
#      This function is called by the find function, its purpose
#      is to decide if it is important to look at a file or not.  When
#      a file is important, we log it or call &look_at_cvs_file on it.
#
#    ALGORITHM
#      1) If the file is an archive file, we call &look_at_cvs_file on
#         it.
#      2) Else, if the file is not in the ignore list, we store its name
#         for later.
#
#    PARAMETERS :
#      NONE
#
#    GLOBALS :
#      $cvsroot               - The CVS repository to search through
#      @ignore_files          - File patterns we can afford to ignore.
#      $File::Find::name      - The absolute path of the file being examined.
#
#    RETURNS :
#      NONE
#
#    COMMENTS :
#      NONE
#
######################################################################
sub process_file
{
    if( ! -d $File::Find::name )
	{
		my $path = $File::Find::name;
		$path =~ s#^$cvsroot/(\./)*##;
		$total_files++;
		verbose( "Examining `$path'\n" );
		if( $path =~ s/,v$// )
		{
			look_at_cvs_file( $path );
		}
		elsif( !grep { $path =~ $_ } @ignore_files )
		{
			push @extra_files, $path;
			verbose( "Adding unrecognized file `$path' to corrupted list.\n" );
		}
		else
		{
			$ignored_files++;
			verbose( "Ignoring `$path'\n" );
		}
	}
}
######################################################################
#
#    NAME :
#      look_at_cvs_file
#
#    PURPOSE :
#      To decide if a file is broken or not.  The algorithm is:
#      a)  Get the revision history for the file.
#              - If that fails the file is broken, save the fact
#                and continue processing other files.
#              - If that succeeds we have a list of revisions.
#      b)  For each revision call &check_revision on the file.
#              - If that fails the file is broken, save the fact
#                and continue processing other files.
#      c)  Continue on 
#
#    PARAMETERS :
#      $file - The path of the file to look at, relative to $cvsroot and
#              suitable for use as an argument to `cvs co', `cvs rlog', and
#              the rest of CVS's r* commands.
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      NONE
#
#    COMMENTS :
#      We have to handle Attic files in a special manner.
#      Basically remove the Attic from the string if it
#      exists at the end of the $path variable.
#
######################################################################
sub look_at_cvs_file
{
    my( $file ) = @_;
    my( $name, $path ) = fileparse( $file );
    $file = $path . $name if $path =~ s#Attic/$##;
    my( $finfo, $rinfo ) = get_history( $file );
    unless( defined $rinfo )
    {
        verbose( "\t`$file' is corrupted.  It was determined to contain no\n"
		         . "\trevisions via a cvs rlog command\n" );
        push( @list_of_broken_files, $file );
        return();
    }
    my @int_revisions =
		$all_revisions ? keys %$rinfo
    	               : find_interesting_revisions( keys %$rinfo );
    foreach my $revision ( @int_revisions )
    {
        verbose( "\t\tLooking at Revision: $revision\n" );
        if( !check_revision( $file, $revision, $finfo, $rinfo ) )
        {
            verbose( "\t$file is corrupted in revision: $revision\n" );
            push( @list_of_broken_files, $file );
            return();
        }
    }
}
######################################################################
#
#    NAME :
#      get_history
#
#    PURPOSE :
#      To retrieve an array of revision numbers.
#
#    PARAMETERS :
#      $file - The file to retrieve the revision numbers for
#
#    GLOBALS :
#      $cvsroot - the CVSROOT we are examining
#
#    RETURNS :
#      On Success - A hash of revision info, indexed by revision numbers.
#      On Failure - undef.
#
#    COMMENTS :
#      The $_ is saved off because The File::find functionality
#      expects the $_ to not have been changed.
#      The -N option for the rlog command means to spit out 
#      tags or branch names.
#
######################################################################
sub get_history
{
	my( $file ) = @_;
	$file =~ s/(["\$`\\])/\\$1/g;
	my %finfo;		# Info about the file.
	my %rinfo;		# Info about revisions in the file.
	my $revision;
    my $fh = new IO::File( "cvs -d $cvsroot rlog -N \"$file\""
                           . ($verbose ? "" : " 2>&1") . " |" )
		or die( "unable to run `cvs rlog', help" );
	my $ignore = -1;
    while( my $line = $fh->getline )
    {
		if( $ignore == 1 ) 
		{
			if( ( $revision ) = $line =~ /^revision (.*?)(\tlocked by: \S+;)?$/ )
			{
  				unless($revision =~ m/^\d+\.\d+(?:\.\d+\.\d+)*$/)
				{
					push @invalid_revs, { 'file' => $file, 'rev' => $revision };
					verbose( "Adding invalid revision `$revision' of file `$file' to invalid revs list.\n" );
				}
				$ignore++;
				next;
			}
			# We require ---- before a ^revision tag, not a revision
			# after every ----.
			$ignore = 0;
        }
		if( $ignore == 2 )
		{
		    if( my ( $date, $author, $state ) =
		             $line =~ /^date: (\S+ \S+);  author: ([^;]+);  state: (\S+);/ )
			{
				$rinfo{$revision} =
				{
					'date' => $date,
					'author' => $author,
					'state' => $state
				}
			}
			else
			{
				die "Couldn't read date/author/state for revision $revision\n"
				    . "of $file from `cvs rlog'.\n"
				    . "line = $line";
			}
			$ignore = 0;
			next;
		}
		if( $ignore == -1 )
		{
			# Until we find the first ---- below, we can read general file info
		    if( my ( $kwmode ) =
		             $line =~ /^keyword substitution: (\S+)$/ )
			{
				$finfo{'kwmode'} = $kwmode;
				next;
			}
		}
		# rlog outputs a "----" line before the actual revision
		# without this we'll pick up peoples comments if they 
		# happen to start with revision
		if( $line =~ /^----------------------------$/ )
		{
			# Catch this case when $ignore == -1 or 0
			$ignore = 1;
			next;
		}
    }
	if( $verbose )
	{
		for (keys %rinfo)
		{
			verbose( "Revision $_: " );
			verbose( join( ", ", %{$rinfo{$_}} ) );
			verbose( "\n" );
		}
	}
	die "Syserr closing pipe from `cvs co': $!"
		if !$fh->close && $!;
	return if $?;
    return( \%finfo, %rinfo ? \%rinfo : undef );
}
######################################################################
#
#    NAME :
#      check_revision
#
#    PURPOSE :
#      Given a file and a revision number ensure that we can check out that
#      file.
#
#      If the user has specified any scripts (passed in as arguments to --exec
#      and stored in @scripts), run them on the checked out revision.  If
#      executable scripts exit with a non-zero status or evaluated scripts set
#      $@ (die), print $status or $@ as a warning.
#
#    PARAMETERS :
#      $file     - The file to look at.
#      $revision - The revision to look at.
#      $rinfo    - A reference to a hash containing information about the
#                  revisions in $file.
#                  For instance, $rinfo->{$revision}->{'date'} contains the
#                  date revision $revision was committed.
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      If we can get the File - 1
#      If we can not get the File - 0
#
#    COMMENTS :
#      cvs command line options are as followed:
#        -n - Do not run any checkout program as specified by the -o
#             option in the modules file
#        -p - Put all output to standard out.
#        -r - The revision of the file that we would like to look at.
#        -ko - Get the revision exactly as checked in - do not allow
#              RCS keyword substitution.
#      Please note that cvs will return 0 for being able to successfully
#      read the file and 1 for failure to read the file.
#
######################################################################
sub check_revision
{
    my( $file, $revision, $finfo, $rinfo ) = @_;
	$file =~ s/(["\$`\\])/\\$1/g;
	# Allow binaries to be checked out as such.  Otherwise, use -ko to avoid
	# replacing keywords in the files.
	my $kwmode = $finfo->{'kwmode'} eq 'b' ? '' : ' -ko';
    my $command = "cvs -d $cvsroot co$kwmode -npr $revision \"$file\"";
	my $ret_code;
	verbose( "Executing `$command'.\n" );
	if( @scripts )
	{
    	my $fh = new IO::File $command . ($verbose ? "" : " 2>&1") . " |";
		fcntl( $fh, F_SETFD, 0 )
			or die "Can't clear close-on-exec flag on filehandle: $!";
		my $count;
		foreach my $script (@scripts)
		{
			$count++;
			if( $script !~ /\n/ && -x $script )
			{
				# exec external script
				my $status = system $script, $cvsroot, $file, $revision,
				                    fileno( $fh );
				warn "`$script $cvsroot $file $revision "
				     . fileno( $fh )
				     . "' exited with code $status"
					if $status;
			}
			else
			{
				# eval script
				@_ = ($cvsroot, $file, $revision, $fh);
				eval $script;
				warn "script $count ($cvsroot, $file, $revision, $fh) exited abnormally: $@"
					if $@;
			}
		}
		# Read any data left so the close will work even if our called script
		# didn't finish reading the data.
		() = $fh->getlines;		# force list context
		die "Syserr closing pipe from `cvs co': $!"
			if !$fh->close && $!;
		$ret_code = $?;
	}
	else
	{
    	$ret_code = 0xffff & system "$command >/dev/null 2>&1";
	}
    return !$ret_code;
}
######################################################################
#
#    NAME :
#      find_interesting_revisions
#
#    PURPOSE :
#      CVS stores information in a logical manner.  We only really
#      need to look at some interestin revisions.  These are:
#      The first version
#      And the last version on every branch.
#      This is because cvs stores changes descending from 
#      main line. ie suppose the last version on mainline is 1.6
#      version 1.6 of the file is stored in toto.  version 1.5
#      is stored as a diff between 1.5 and 1.6.  1.4 is stored 
#      as a diff between 1.5 and 1.4.
#      branches are stored a little differently.  They are 
#      stored in ascending order.  Suppose there is a branch
#      on 1.4 of the file.  The first branches revision number
#      would be 1.4.1.1.  This is stored as a diff between 
#      version 1.4 and 1.4.1.1.  The 1.4.1.2 version is stored
#      as a diff between 1.4.1.1 and 1.4.1.2.  Therefore
#      we are only interested in the earliest revision number
#      and the highest revision number on a branch.
#
#    PARAMETERS :
#      @revisions - The list of revisions to find interesting ones
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      @new_revisions - The list of revisions that we find interesting
#
#    COMMENTS :
#
######################################################################
sub find_interesting_revisions
{
    my( @revisions ) = @_;
    my @new_revisions;
    my %max_branch_revision;
    my $branch_number;
    my $branch_rev;
    my $key;
    my $value;
    foreach my $revision( @revisions )
    {
        ( $branch_number, $branch_rev ) = branch_split( $revision );
		$max_branch_revision{$branch_number} = $branch_rev
			if( !exists $max_branch_revision{$branch_number}
				|| $max_branch_revision{$branch_number} < $branch_rev );
	}
	push( @new_revisions, "1.1" ) unless (exists $max_branch_revision{1}
					      && $max_branch_revision{1} == 1);
    while( ( $key, $value ) = each ( %max_branch_revision ) )
    {
        push( @new_revisions, $key . "." . $value );
    }
    my $nrc;
    my $rc;
    $rc = @revisions;
    $nrc = @new_revisions;
    $total_revisions += $rc;
    $total_interesting_revisions += $nrc;
    verbose( "\t\tTotal Revisions: $rc Interesting Revisions: $nrc\n" );
    return( @new_revisions );
}
######################################################################
#
#    NAME :
#      branch_split
#
#    PURPOSE :
#      To split up a revision number up into the branch part and
#      the number part.  For Instance:
#      1.1.1.1 - is split 1.1.1 and 1
#      2.1     - is split 2 and 1
#      1.3.4.5.7.8 - is split 1.3.4.5.7 and 8
#
#    PARAMETERS :
#      $revision - The revision to look at.
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      ( $branch, $revision ) - 
#      $branch - The branch part of the revision number 
#      $revision - The revision part of the revision number
#
#    COMMENTS :
#      NONE
#
######################################################################
sub branch_split
{
    my( $revision ) = @_;
    my $branch;
    my $version;
    my @split_rev;
    my $count;
    @split_rev = split /\./, $revision;
    my $numbers = @split_rev;     
    @split_rev = reverse( @split_rev );
    $branch = pop( @split_rev );
    for( $count = 0; $count < $numbers - 2 ; $count++ )
    {
        $branch .= "." . pop( @split_rev );
    }
    return( $branch, pop( @split_rev ) );
}
######################################################################
#
#    NAME :
#      get_ignore_files_from_cvsroot
#
#    PURPOSE :
#      Retrieve the list of files from the CVSROOT/ directory
#      that should be ignored. 
#      These are the regular files (e.g., commitinfo, loginfo)
#      and those specified in the checkoutlist file.
#
#    PARAMETERS :
#      The CVSROOT
#
#    GLOBALS :
#      NONE
#
#    RETURNS :
#      @ignore - the list of files to ignore
#
#    COMMENTS :
#      NONE
#
######################################################################
sub get_ignore_files_from_cvsroot {
    my( $cvsroot ) = @_;
    my @ignore = (
	               qr{CVS/fileattr$}o,
                   qr{^(./)?CVSROOT/.#[^/]*$}o,
                   qr{^(./)?CVSROOT/checkoutlist$}o,
                   qr{^(./)?CVSROOT/commitinfo$}o,
                   qr{^(./)?CVSROOT/config$}o,
                   qr{^(./)?CVSROOT/cvsignore$}o,
                   qr{^(./)?CVSROOT/cvswrappers$}o,
                   qr{^(./)?CVSROOT/editinfo$}o,
                   qr{^(./)?CVSROOT/history$}o,
                   qr{^(./)?CVSROOT/loginfo$}o,
                   qr{^(./)?CVSROOT/modules$}o,
                   qr{^(./)?CVSROOT/notify$}o,
                   qr{^(./)?CVSROOT/passwd$}o,
                   qr{^(./)?CVSROOT/postadmin$}o,
                   qr{^(./)?CVSROOT/postproxy$}o,
                   qr{^(./)?CVSROOT/posttag$}o,
                   qr{^(./)?CVSROOT/postwatch$}o,
                   qr{^(./)?CVSROOT/preproxy$}o,
                   qr{^(./)?CVSROOT/rcsinfo$}o,
                   qr{^(./)?CVSROOT/readers$}o,
                   qr{^(./)?CVSROOT/taginfo$}o,
                   qr{^(./)?CVSROOT/val-tags$}o,
                   qr{^(./)?CVSROOT/verifymsg$}o,
                   qr{^(./)?CVSROOT/writers$}o
	             );
    my $checkoutlist_file = "$cvsroot/CVSROOT/checkoutlist";
	if( -f $checkoutlist_file && -r $checkoutlist_file )
	{
		my $fh = new IO::File "<$checkoutlist_file"
			or die "Unable to read checkoutlist file ($checkoutlist_file): $!\n";
		my @list = $fh->getlines;
		chomp( @list );
		$fh->close or die( "Unable to close checkoutlist file: $!\n" );
		foreach my $line( @list )
		{
			next if( $line =~ /^#/ || $line =~ /^\s*$/ );
			$line =~ s/^\s*(\S+)(\s+.*)?$/$1/;
			push @ignore, qr{^(./)?CVSROOT/$line$};
		}
	}	
    return @ignore;
}
######
###### Go.
######
exit main @ARGV;
# vim:tabstop=4:shiftwidth=4